Commit e694b41b  Author: XveLingKun

5/10

Parent 6da55a3e
...@@ -891,6 +891,7 @@ class BaseCore: ...@@ -891,6 +891,7 @@ class BaseCore:
page_size = doc.page_count page_size = doc.page_count
for page in doc.pages(): for page in doc.pages():
retData['content'] += page.get_text() retData['content'] += page.get_text()
except: except:
log = self.getLogger() log = self.getLogger()
log.error(f'文件损坏') log.error(f'文件损坏')
......
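The hunk above walks PDF pages with what looks like PyMuPDF (page_count, pages(), get_text()) and swallows failures with a bare except. A minimal standalone sketch of that pattern, assuming the doc object comes from PyMuPDF (fitz) and logging the actual error (illustrative, not part of the commit):

import fitz  # PyMuPDF

def read_pdf_text(path, logger):
    content = ''
    try:
        doc = fitz.open(path)
        page_size = doc.page_count      # same attribute the hunk reads
        for page in doc.pages():
            content += page.get_text()
    except Exception as e:              # keep the cause instead of a bare except
        logger.error(f'文件损坏: {e}')
    return content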
...@@ -324,7 +324,7 @@ def AnnualEnterprise(): ...@@ -324,7 +324,7 @@ def AnnualEnterprise():
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
print('=======') print('=======')
for item in gn_social_list: for item in gn_social_list:
r.rpush('AnnualEnterprise:gnqy_socialCode', item) r.rpush('AnnualEnterprise:zjh_socialCode', item)
closeSql(cnx,cursor) closeSql(cnx,cursor)
#企业年报定时任务 #企业年报定时任务
...@@ -514,7 +514,7 @@ def NQEnterprise(): ...@@ -514,7 +514,7 @@ def NQEnterprise():
for item in nq_social_list: for item in nq_social_list:
#新三板企业财务数据 上市信息 核心人员已采集 企业动态、企业公告未采集 企业公告脚本已开发,企业动态需要每天放入redis #新三板企业财务数据 上市信息 核心人员已采集 企业动态、企业公告未采集 企业公告脚本已开发,企业动态需要每天放入redis
# r.rpush('NQEnterprise:nq_Ipo', item) # r.rpush('NQEnterprise:nq_Ipo', item)
r.rpush('NQEnterprise:nq_finance',item) r.rpush('NQEnterprise:nq_finance', item)
# r.rpush('NQEnterprise:nq_notice',item) # r.rpush('NQEnterprise:nq_notice',item)
closeSql(cnx_, cursor_) closeSql(cnx_, cursor_)
...@@ -674,10 +674,11 @@ if __name__ == "__main__": ...@@ -674,10 +674,11 @@ if __name__ == "__main__":
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
# MengZhi() # MengZhi()
NQEnterprise() # NQEnterprise()
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
AnnualEnterprise()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
......
...@@ -88,6 +88,12 @@ def doJob(): ...@@ -88,6 +88,12 @@ def doJob():
'version': 'TYC-Web' 'version': 'TYC-Web'
} }
cookies_list, id_cookie, user_name = token.get_cookies() cookies_list, id_cookie, user_name = token.get_cookies()
if cookies_list:
pass
else:
log.info("没有账号了,等待30分钟")
time.sleep(30 * 60)
return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======') log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {} cookies = {}
for cookie in cookies_list: for cookie in cookies_list:
...@@ -97,7 +103,7 @@ def doJob(): ...@@ -97,7 +103,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
social_code = '91110108780992804C' social_code = '91370212MA3MJMA0XW'
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
......
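The loop body that fills the cookies dict is elided by the hunk above. A hedged sketch of the usual conversion from Selenium-style cookie dicts to a requests cookie mapping (the sample data is illustrative):

cookies_list = [{'name': 'TYCID', 'value': 'abc'}, {'name': 'auth_token', 'value': 'xyz'}]  # illustrative
cookies = {}
for cookie in cookies_list:
    cookies[cookie['name']] = cookie['value']
# the dict can then be passed along as requests.get(url, headers=headers, cookies=cookies)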
...@@ -26,7 +26,7 @@ if __name__ == "__main__": ...@@ -26,7 +26,7 @@ if __name__ == "__main__":
name = input('所属用户:') name = input('所属用户:')
driver = create_driver() driver = create_driver()
driver.get(url) driver.get(url)
time.sleep(80) time.sleep(60)
cookies = driver.get_cookies() cookies = driver.get_cookies()
# print(driver.get_cookies()) # print(driver.get_cookies())
......
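The script above waits for a manual login and then reads driver.get_cookies(). One plausible way to persist that list so a later token.get_cookies() call can serve it is a JSON dump; this is only a sketch, the project itself appears to keep login info in MongoDB ('天眼查登录信息'), and the file name here is an assumption:

import json

cookies = driver.get_cookies()          # Selenium returns a list of cookie dicts
with open(f'{name}_tyc_cookies.json', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False)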
title gwyRelevantDocuments
chcp 65001
cd /d %~dp0
python38 gwyRelevantDocuments.py
\ No newline at end of file
title gwyfile
chcp 65001
cd /d %~dp0
python38 gwyfile.py
\ No newline at end of file
title gwyparts
chcp 65001
cd /d %~dp0
python38 gwyparts.py
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
def query():
sql = "select id from wx_link where state = '300' order by publish_time desc"
cursor.execute(sql)
result = cursor.fetchall()
return result
if __name__ == "__main__":
result = query()
# result = ['1990264','1990265','1998085','1998086','2039312','2067942','2087699','2087700','2087701','2087774','2087775','2087776','2087777','2088091','2088092','2088093','2088445','2088446','2088447','2088455','2121977','2385237','2385238','2386227','2678376','2678377','2678421','2678422','2678425','2731944','2731945','2731946','2732184','2732185','2732205','2732206','2732313','2732314','2732317','2732318','2732319','2732320','2732321','2732323','2732438','2732439','2732440','2732453','2732455','2732456','2732483','2732497','2732910','2732911','2732912','2732913','2732915','2732918','2732952','2732953','2732958','2732959','2732960','2733052','2733053','2733097','2733100','2733101','2733120','2733121','2733123','2733124','2733127','2733128','2733130','2733146','2733147','2733148','2733149','2733150','2733151','2733152','2733153','2733154','2733155','2733156','2733157','2733328','2733345','2733346','2733515','2733518','2733519','2733534','2733536','2733537','2733565','2733566','2733595','2733596','2733598','2733627','2733703','2733705','2733706','2733814','2733958','2733959','2733960','2734035','2734062','2734113','2734180','2734182','2734270','2734271','2734272','2734273','2734274','2734275','2734276','2734307','2734308','2734311','2734312','2734313','2734314','2734315','2734316','2734317','2734324','2734325','2734326','2734328','2734329','2734330','2734339','2734340','2734341','2734388','2734389','2734536','2734537','2734538','2735181','2735182','2735183','2735184','2735185','2735186','2735187','2735188','2735190','2735191','2735194','2735196','2735266','2735267','2735268','2735269','2735270','2735271','2735272','2735276','2735277','2735278','2735279','2735280','2735281','2735282','2735283','2735297','2735561','2735625','2735627','2735628','2735662','2735663','2736211','2736212','2736213','2736214','2736215','2736216','2736544','2736545','2736546','2736559','2736677','2736678','2736817','2736819','2736820','2736821','2736823','2736824','2736825','2736828','2736905','2736906','2736907','2736912','2736913','2736914','2736915','2736916','2736917','2736963','2736964','2736988','2736989','2736990','2736991','2737108','2737111','2737600','2737601','2737604','2737701','2737702','2737703','2737759','2737928','2737930','2737931','2738046','2738050','2738051','2738052','2738053','2738356','2738357','2738358','2738456','2738460','2738461','2738485','2738486','2738607','2738608','2738609','2739613','2739614','2739615','2739649','2739650','2739651','2739908','2739909','2739910','2739911','2739912','2739913','2740019','2740022','2740023','2740123','2740207','2740208','2740209','2740252','2740255','2740256','2740269','2740270','2740271','2740412','2740413','2740485','2740486','2740487','2740535','2740536','2740537','2740538','2740539','2740540','2740541','2740542','2740543','2740544','2740545','2740546','2740547','2740548','2740549','2740659','2740660','2740661','2740662','2740663','2740664','2740924','2740926','2740927','2740964','2740965','2740966','2741091','2741092','2741093','2741098','2741099','2741100','2741129','2741130','2741131','2744702','2744703','2744704','2744705','2759363','2759364','2759365','2759546','2759547','2759548','2759549','2759550','2759551','2759552','2759553','2759554','2759555','2759556','2759800','2759801','2759802','2759803','2759805','2759806','2759829','2760062','2760063','2760064','2760729','2760730','2760733','2760899','2760900','2760902','2760903','2760904','2760905','2761327','2761328','2761332','2761783','2761784','2761785','2761795','2761797','2761799','2761805','27
61819','2761820','2761821','2761822','2761891','2761892','2761893','2761894','2761895','2761896','2761897','2761900','2762070','2762071','2762072','2762073','2762074','2762075','2762076','2762077','2762078','2762079','2762080','2762081','2762082','2762083','2762084','2762085','2762087','2762088','2762089','2762090','2762091','2762092','2762093','2762125','2762126','2762127','2762137','2762138','2762139','2762160','2762161','2762162','2762195','2762196','2762197','2762410','2762411','2762419','2762470','2762471','2762472','2762873','2762875','2762877','2762930','2762931','2762937','2762938','2762939','2762940','2762941','2763276','2763277','2763278','2763304','2763305','2763307','2763308','2763309','2763310','2763312','2763697','2763698','2763699','2763700','2763701','2763702','2763703','2764035','2764036','2764037','2764039','2764040','2764041','2764042','2764043','2764044','2764045','2764046','2764047','2764048','2764049','2764050','2764051','2764055','2764056','2764057','2764059','2764060','2764062','2764063','2764064','2764065','2764164','2764165','2764369','2764370','2764567','2764568','2764570','2764618','2764619','2764620','2764744','2764745','2764748','2764770','2764771','2764772','2764869','2764870','2764871','2764898','2764899','2764900','2764901','2764902','2764903','2764904','2764905','2764906','2764907','2764908','2764909','2764910','2764911','2764912','2764913','2764914','2764915','2764916','2764917','2764918','2764933','2764934','2764935','2764936','2764937','2764938','2764939','2764957','2764958','2764959','2764960','2764961','2764963','2764964','2764965','2764966','2765020','2765021','2765022','2765023','2765024','2765026','2765229','2765230','2765231','2765232','2765233','2765293','2765294','2765295','2765296','2765297','2765298','2765299','2765300','2765301','2765302','2765303','2765304','2765305','2765306','2765307','2765308','2765414','2765416','2765424','2765571','2765572','2765573','2765796','2765797','2765798','2765804','2765805','2765807','2765808','2765809','2765810','2765811','2765812','2765813','2765814','2765815','2765816','2765820','2765821','2765822','2766021','2766022','2766023','2766024','2766025','2766048','2766060','2766061','2766062','2766063','2766064','2766066','2766068','2766069','2766071','2766072','2766073','2766074','2766075','2766169','2766194','2766195','2766197','2766208','2766209','2766244','2766245','2766246','2766536','2766537','2766538','2766539','2766540','2766547','2766669','2766670','2766671','2766673','2766674','2766675','2766676','2766677','2766678','2766679','2766680','2766681','2766682','2766790','2766792','2766826','2766827','2767032','2767120','2767121','2767122','2767123','2767126','2767127','2767128','2767129','2767130','2767131','2767132','2767133','2767134','2767135','2767136','2767137','2767138','2767139','2767173','2767174','2767408','2767409','2767410','2767411','2767502','2767503','2767534','2767535','2767545','2767546','2767547','2767548','2767600','2767602','2767642','2767643','2767655','2767656','2767717','2767718','2767719','2767720','2767732','2767740','2767741','2767756','2767758','2767766','2767767','2767807','2767808','2767809','2767810','2767817','2767818','2767825','2767827','2767828','2767829','2767840','2767887','2767898','2767899','2767900','2767901','2767902','2767903','2767906','2767907','2767908','2767955','2768155','2768156','2768166','2768167','2768168','2768170','2768284','2768360','2768368','2768378','2768826','2768827','2768845','2768846','2768847','2768848','2768849','2768850','2768851','2768852','2768871
','2768872','2768877','2768878','2768879','2768880','2768888','2768912','2768913','2768914','2768916','2768917','2768918','2768919','2768920','2768921','2768922','2768923','2768924','2769071','2769075','2769240','2769258','2769270','2769271','2769272','2769399','2769400','2769427','2769428','2769429','2769430','2769489','2769491','2769542','2769543','2769568','2769569','2770721','2770722','2770723','2770724','2770725','2770726','2770728','2770729','2770730','2770731','2770732','2770733','2770734','2770738','2770739','2770793','2770796','2770903','2770904','2770906','2770907','2770909','2770977','2770978','2771278','2771280','2771661','2771662','2771929','2771932','2772086','2772087','2772088','2772089','2772090','2772130','2772132','2772319','2772320','2772409','2772410','2772423','2772424','2772425','2772426','2772630','2772632','2772721','2772723','2772724','2772725','2772737','2772738','2772749','2772750','2772751','2772752','2772753','2773224','2773227','2773253','2773254','2773287','2773289','2773346','2773347','2773348','2773349','2773350','2773351','2773385','2773386','2773387','2773388','2773558','2773559','2773563','2773564','2773565','2773566','2773567','2773568','2773777','2773778','2773805','2773806','2773808','2773834','2773836','2773837','2773838','2773839','2773841','2774222','2774223','2774302','2774303','2774319','2774320','2774334','2774335','2774440','2774512','2774513','2774527','2774528','2774529','2774531','2774660','2774807','2774830','2774847','2775022','2775023','2775164','2775165','2775350','2775351','2775386','2775387','2775482','2775483','2775677','2775680','2776678','2776681','2776869','2776872','2776873','2776881','2776882','2776883','2776884','2776885','2776993','2776994','2776996','2776997','2777106','2777115','2777123','2777124','2777125','2777126','2777127','2777128','2777129','2777130','2777131','2777132','2777237','2777240','2777814','2777815','2777817','2777819','2777820','2777821','2777822','2777823','2777824','2777825','2777826','2777827','2777828','2777829','2777830','2778099','2778100','2778267','2778268','2778484','2778486','2778601','2778603','2778707','2778708','2779029','2779030','2779031','2779032','2779033','2779034','2779035','2779047','2779048','2779253','2779369','2779371','2779528','2779529','2779530','2779533','2779591','2779592','2779790','2779791','2780196','2780501','2780504']
for id in result:
# 放入redis
r.rpush("WeiXinGZH:linkid_fail", id[0])
# r.rpush("WeiXinGZH:linkid_fail", id)
\ No newline at end of file
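The script above only refills the Redis list. A hedged sketch of the consuming side, which would pop the ids back out with lpop and re-dispatch them (the worker itself is not part of this commit):

import redis

r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
while True:
    link_id = r.lpop("WeiXinGZH:linkid_fail")
    if link_id is None:                 # list drained
        break
    print(f"re-crawl wx_link id {link_id.decode()}")   # redis-py returns bytes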
"""获取每天失败的列表--返回给数据组"""
import datetime
import time
import pandas as pd
import smtplib
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import pymysql
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# cnx = baseCore.cnx
# cursor = baseCore.cursor
def sendEmail(file_name, receiver, filename):
file = open(file_name, 'rb').read()
# 发送邮箱地址
sender = '1195236739@qq.com'
# 发送邮箱登录 账户 密码
username = '1195236739@qq.com'
password = 'gatvszshadvpgjci'
smtpserver = 'smtp.qq.com'
# # 接收邮箱地址
# receiver = 'fujunxue@ciglobal.cn'
maile_title = filename
message = MIMEMultipart()
message['From'] = sender
message['To'] = receiver
message['Subject'] = Header(maile_title, 'utf-8')
message.attach(MIMEText(filename, 'plain', 'utf-8'))
xlsxApart = MIMEApplication(file)
xlsxApart.add_header('Content-Disposition', 'attachment', filename=filename)
message.attach(xlsxApart)
smtpObj = smtplib.SMTP_SSL(smtpserver) # 注意:如果遇到发送失败的情况(提示远程主机拒接连接),这里要使用SMTP_SSL方法
smtpObj.connect(smtpserver, port=465)
smtpObj.login(username, password)
smtpObj.sendmail(sender, receiver, message.as_string())
print("邮件发送成功!!!")
smtpObj.quit()
# 解析失败
def get_failed_list(today_time):
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
sql = f"select * from wx_link where (state = '300' or state = '600' and state = '200') and create_time >= '{today_time}' "
# sql = f"select * from wx_link where state='800' "
print(sql)
cursor.execute(sql)
result = cursor.fetchall()
cursor.close()
cnx.close()
return result
# 发布内容不存在
def get_null_list(today_time):
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
sql = f"select * from wx_link where (state='800' or state='500') and create_time >= '{today_time}' "
cursor.execute(sql)
result = cursor.fetchall()
cursor.close()
cnx.close()
return result
def get_info(result):
fail_list = []
for info in result:
site_name = info[3] # 公众号
info_source_code = info[4]
title = info[5]
publish_time = info[6]
link = info[7]
# 写入detaframe
# 创建一个字典,其中包含当前行的数据
row = {
'公众号': site_name,
'公众号编码': info_source_code,
'标题': title,
'发布时间': publish_time,
'链接': link
}
fail_list.append(row)
return fail_list
if __name__ == "__main__":
# 创建一个空的DataFrame,其中包含你需要的列名
while True:
# 计算今天的时间
now = datetime.datetime.now()
print(now)
time.sleep(1)
print(datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0))
if now >= datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0):
pass
else:
continue
today_time = str(now.strftime("%Y-%m-%d 00:00:00"))
print(today_time)
result = get_failed_list(today_time)
result_null = get_null_list(today_time)
if result:
fail_list = get_info(result)
result_df = pd.DataFrame(fail_list)
result_df.to_excel(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", index=False)
sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号采集失败列表")
sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "mr@ciglobal.cn", "微信公众号采集失败列表")
else:
log.info(f'{today_time} 没有采集失败的文章')
if result_null:
null_list = get_info(result_null)
null_df = pd.DataFrame(null_list)
null_df.to_excel(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", index=False)
sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号文章内容为空列表")
sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "mr@ciglobal.cn", "微信公众号文章内容为空列表")
else:
log.info(f'{today_time} 没有采集到空的文章')
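The __main__ loop above wakes every second and only proceeds once the clock passes 23:59:59. If the busy wait ever becomes a problem, the same gate can be computed as a single sleep; a sketch under that assumption (not what the committed code does):

import datetime
import time

now = datetime.datetime.now()
send_at = now.replace(hour=23, minute=59, second=59, microsecond=0)
if send_at > now:
    time.sleep((send_at - now).total_seconds())   # wake up once, right at the send window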
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
''' '''
成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700 成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700 发布内容不存在800 图片处理失败300、600
''' '''
import re import re
...@@ -118,6 +118,10 @@ def get_info(dict_json, linkid): ...@@ -118,6 +118,10 @@ def get_info(dict_json, linkid):
# updatewxLink(url_news, info_source_code, 400) # updatewxLink(url_news, info_source_code, 400)
return False return False
soup_news = BeautifulSoup(res_news.content, 'html.parser') soup_news = BeautifulSoup(res_news.content, 'html.parser')
if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
log.info(f'--errorCode:800--{origin}---{news_date}---{news_title}----内容无法查看')
updatewxLink(url_news, info_source_code, 800)
return False
try: try:
news_html = soup_news.find('div', {'id': 'js_content'}) news_html = soup_news.find('div', {'id': 'js_content'})
news_html['style'] = 'width: 814px ; margin: 0 auto;' news_html['style'] = 'width: 814px ; margin: 0 auto;'
...@@ -228,7 +232,7 @@ def get_info(dict_json, linkid): ...@@ -228,7 +232,7 @@ def get_info(dict_json, linkid):
} }
for nnn in range(0, 3): for nnn in range(0, 3):
try: try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8')) kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
kafka_time_out = kafka_result.get(timeout=10) kafka_time_out = kafka_result.get(timeout=10)
# add_url(sid, url_news) # add_url(sid, url_news)
...@@ -252,7 +256,7 @@ def get_info(dict_json, linkid): ...@@ -252,7 +256,7 @@ def get_info(dict_json, linkid):
} }
for nnn2 in range(0, 3): for nnn2 in range(0, 3):
try: try:
producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2,7,0))
kafka_result2 = producer2.send("collectionAndDispatcherInfo", kafka_result2 = producer2.send("collectionAndDispatcherInfo",
json.dumps(dic_info2, ensure_ascii=False).encode('utf8')) json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
break break
......
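Both producers above now pin api_version=(2, 7, 0). With kafka-python this skips the broker version probe on connect, which is a common cause of timeouts on restrictive networks. A minimal self-contained sketch of the same producer call (the payload is illustrative):

import json
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
future = producer.send("crawlerInfo", json.dumps({"demo": "payload"}, ensure_ascii=False).encode('utf8'))
record_metadata = future.get(timeout=10)   # raises if the broker did not acknowledge in time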
(This commit removes the file's previous scratch content — commented-out Excel-append and Redis-counter experiments plus a small "for i in range(0, 24, 5): print(i)" loop — and replaces it with the following.)

import re
import requests
from bs4 import BeautifulSoup
from retry import retry
from base.BaseCore import BaseCore

baseCore = BaseCore()


@retry(tries=3, delay=2)
def getrequest(url_news):
    ip = baseCore.get_proxy()
    res_news = requests.get(url_news, proxies=ip, timeout=20)
    if res_news.status_code != 200:
        raise
    return res_news


def rm_style_attr(soup):
    # 查找所有含有style属性的标签
    style_tags = soup.find_all(style=True)
    # 遍历每个style标签
    for style_tag in style_tags:
        try:
            # 使用正则表达式替换
            styleattr = style_tag['style']
            styleattr = re.sub(r'visibility:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'font-family:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'color:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'font-size:(?s).{1,}?;', '', styleattr)
            style_tag['style'] = styleattr
        except:
            continue

    # first_div = soup.select('div[id="js_content"]')
    # # 设置style属性
    # first_div['style'] = 'width: 814px ; margin: 0 auto;'
    first_div = soup.select('div[id="js_content"]')
    if first_div:
        first_div = first_div[0]  # 获取第一个匹配的元素
        first_div['style'] = 'width: 814px ; margin: 0 auto;'  # 设置style属性

    return soup


if __name__ == "__main__":
    # url_news = "http://mp.weixin.qq.com/s?__biz=MjM5NDMxOTMwNg==&mid=2653175413&idx=1&sn=8c0853ddab6e27799c4452e0b6e63156&chksm=bd5900d08a2e89c698de51f102b7423b33a27522966ca2218ca1b8ef290837b0087173c74bcb#rd"
    url_news = "http://mp.weixin.qq.com/s?__biz=MzU4ODQwNTIxMw==&mid=2247528290&idx=4&sn=370655b44dfd31b99984e2eeeb4868e0&chksm=fddf6fd0caa8e6c63a0b5e4fece250415fcb56f03f305450b1434978769b443eaa416342326e#rd"
    # 修改请求方法,retry 3次
    try:
        res_news = getrequest(url_news)
        print(res_news)
    except:
        try:
            res_news = requests.get(url_news, timeout=20)
            print('请求成功')
        except:
            res_news = None
            pass
soup_news = BeautifulSoup(res_news.content, 'html.parser')
if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
print('失败')
try:
news_html = soup_news.find('div', {'id': 'js_content'})
news_html['style'] = 'width: 814px ; margin: 0 auto;'
#del news_html['style']
news_html = rm_style_attr(news_html)
# print(news_html)
del news_html['id']
del news_html['class']
except Exception as e:
print(e)
news_html = None
# print(news_html)
news_content = news_html.text
list_img = news_html.find_all('img')
for num_img in range(len(list_img)):
img_one = list_img[num_img]
url_src = img_one.get('data-src')
# print(url_src)
if url_src and 'gif' in url_src:
url_img = ''
img_one.extract()
else:
try:
try:
name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
except:
img_one.extract()
continue
try:
res = requests.get(url_src, timeout=20)
except:
img_one.extract()
continue
except Exception as e:
print(f'--error--{url_news}-----------{e}')
for tag in news_html.descendants:
try:
del tag['style']
except:
pass
list_section = news_html.find_all('section')
for section in list_section:
section.name = 'div'
print(news_html)
\ No newline at end of file
...@@ -37,7 +37,7 @@ element.getparent() #获取给定元素的父元素 ...@@ -37,7 +37,7 @@ element.getparent() #获取给定元素的父元素
# print(res) # print(res)
"""测试中国执行信息公开网 模拟浏览器""" """测试中国执行信息公开网 模拟浏览器"""
import ddddocr # import ddddocr
from PIL import Image from PIL import Image
import re import re
...@@ -144,34 +144,34 @@ import json ...@@ -144,34 +144,34 @@ import json
import requests import requests
import pymongo import pymongo
from base import BaseCore from base import BaseCore
baseCore = BaseCore.BaseCore() # baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() # log = baseCore.getLogger()
#
#
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[ # db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504'] # '数据源_0504']
#
datas = db_storage.find({'postCode':'2'}).limit(1) # datas = db_storage.find({'postCode':'2'}).limit(1)
for data in datas: # for data in datas:
title = data['titleForeign'] # title = data['titleForeign']
contentWithTag = data['richTextForeign'] # contentWithTag = data['richTextForeign']
summary = data['contentForeign'] # summary = data['contentForeign']
dic_info = { # dic_info = {
'title':title, # 'title':title,
# 'summary':summary, # # 'summary':summary,
'contentWithTag':contentWithTag # 'contentWithTag':contentWithTag
} # }
headers = { # headers = {
'Content-Type': 'application/json', # 'Content-Type': 'application/json',
} # }
dic_info_ = json.dumps(dic_info) # dic_info_ = json.dumps(dic_info)
# print(dic_info_) # # print(dic_info_)
# with open('./data.json','w') as f: # # with open('./data.json','w') as f:
# f.write(dic_info_) # # f.write(dic_info_)
# break # # break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers) # # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers) # req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text) # log.info(req.text)
# import re, datetime # import re, datetime
# #
...@@ -237,4 +237,228 @@ for data in datas: ...@@ -237,4 +237,228 @@ for data in datas:
# if __name__ == "__main__": # if __name__ == "__main__":
# publishtime_ = '1小时17分钟前' # publishtime_ = '1小时17分钟前'
# publish_time = paserTime(publishtime_).strftime("%Y-%m-%d") # publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
# print(publish_time) # print(publish_time)
\ No newline at end of file
# import pandas as pd
#
# # 创建一个示例DataFrame
# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
#
# # 要追加的行
# new_rows = pd.DataFrame({'A': [4, 5], 'B': [7, 8]})
#
# # 追加行到原DataFrame
# df = pd.concat([df, new_rows], ignore_index=True)
#
# print(df)
# import pandas as pd
#
# # 假设我们有两个DataFrame
# df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# df2 = pd.DataFrame({'A': [7, 8, 9], 'B': [10, 11, 12]})
#
# # 从df1中取出第一行数据
# row_to_append = df1.iloc[0].to_frame().T
# # print(row_to_append)
#
# # 将这一行数据追加到df2中
# # 注意:这里使用ignore_index=True来忽略索引,并重新设置索引
# # df2 = df2.append(row_to_append, ignore_index=True)
# df2 = pd.concat([df2, row_to_append], ignore_index=True)
#
# # 打印结果
# print(df2)
# import openpyxl
# import redis
#
# # 连接到Redis服务器
# redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
#
# # 打开Excel文件
# workbook = openpyxl.load_workbook('D:\\kkwork\\企业数据\\年报数据任务.xlsx')
#
# # 选择要读取的工作表
# worksheet = workbook['系统批量导出年报名单']
#
# # 选择要读取的列
# column_index = 0 # 选择第2列
#
# # 遍历指定列的单元格,并将值放入Redis列表
# for row in worksheet.iter_rows(values_only=True):
# try:
# cell_value = row[1] + "|" + row[2]
# except:
# print(row[1])
# continue
# # cell_value = row[1]
# redis_client.rpush('NianBao:socialcode', cell_value)
#
# # 关闭Excel文件
# workbook.close()
# def find_empty_id_sequences(id_list):
# empty_id_count = 0
# empty_id_sequences = []
# current_sequence = []
#
# for id in id_list:
# if id is None: # 假设 id 为 None 表示空
# empty_id_count += 1
# current_sequence.append(empty_id_count)
# else:
# if empty_id_count > 0:
# empty_id_sequences.append(current_sequence)
# current_sequence = []
# empty_id_count = 0
#
# if empty_id_count > 0:
# empty_id_sequences.append(current_sequence)
#
# return empty_id_sequences
#
# # 示例使用
# id_list = [1, None, 2, None, None, 3, None, None, None, 4]
# empty_id_sequences = find_empty_id_sequences(id_list)
# print(empty_id_sequences)
# def process_results(results):
# # 初始化输出字符串和上一个结果
# final_output = ""
# previous_res = None
#
# # 初始化连续空的结果索引列表
# empty_indices = []
# empty_id_sequences = []
# all_result = []
# # 对结果进行排序
# sorted_results = sorted(results.items(), key=lambda x: int(x[0].split(".")[0]))
#
# # 遍历排序后的结果
# for index, (_, res) in enumerate(sorted_results):
# if res == '':
# # 如果是空结果,将索引添加到空结果索引列表中
# empty_indices.append(_)
#
# else:
# # 如果结果非空,并且连续空的结果列表不为空,
# # 则添加连续空的结果索引列表到输出中
# if empty_indices:
# # final_output += " ".join(map(str, empty_indices)) + '\n'
# empty_id_sequences.append(empty_indices)
# all_result.append(empty_indices)
# empty_indices = []
# # 如果是非空结果并且与上一个结果不同,则添加到输出中
# if res != previous_res:
# final_output += res + '\n'
# all_result.append(res)
# previous_res = res
#
# return all_result, empty_id_sequences
#
# # 示例使用
# results = {
# "1.0": "",
# "2.0": "",
# "3.0": "result1",
# "4.0": "",
# "5.0": "",
# "6.0": "result2",
# "7.0": "result2",
# "8.0": "",
# "9.0": "result3"
# }
#
# all_result, empty_id_sequences, dic_index = process_results(results)
# print(all_result)
# print(empty_id_sequences)
# print(dic_index)
import Levenshtein
def same_rule(same_list):
# 记录最长的一个
max_len = 0
for item in same_list:
if len(item) > max_len:
max_len = len(item)
char_map_list = []
char_map_score = []
for i in range(max_len):
char_map_list.append(dict())
char_map_score.append(dict())
for index in range(max_len):
for i in range(len(same_list)):
if index < len(same_list[i]):
char = same_list[i][index]
score = same_list[i][index]
else:
char = ""
score = 1
if char not in char_map_list[index]:
char_map_list[index][char] = 1
else:
char_map_list[index][char] += 1
if char not in char_map_score[index]:
char_map_score[index][char] = score
else:
char_map_score[index][char] += score
print(char_map_list)
print(char_map_score)
#返回个数多的;当个数相同时,返回打分高的
result = []
for i in range(max_len):
print('---------------')
print(char_map_list[i].items())
print(sorted(char_map_list[i].items(), key=lambda item:item[1], reverse=True))
result.append(sorted(char_map_list[i].items(), key=lambda item:item[1],reverse=True)[0][0])
return "".join(result)
def aaaaa(final_output):
finall_list = []
same_list = []
# 处理相似的
for result in final_output:
print(f"result:{result}")
if len(same_list) > 0:
ratio = Levenshtein.ratio(result, same_list[-1])
if ratio < 0.5:
# 差异大于0.5
# 对相似的做处理
if len(same_list) > 1:
result_ = max(same_list, key=lambda x: (len(x), x))
same_list = [result_]
finall_list[-1] = result_
finall_list.append(result)
else:
same_list = [result]
finall_list.append(result)
else:
same_list.append(result)
else:
same_list.append(result)
finall_list.append(result)
print(finall_list)
if __name__ == '__main__':
same_list = ['让我们从一次时光旅行', '开启植物天堂的故事', '地球的午夜', '是在火山喷发中度过的', '到了凌晨三四点', '在海洋深处有了生命的迹象', '清晨6点多', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '种蓝藻细菌', '一种蓝藻细菌', '学会利用二氧化碳水和阳光', '制造生命所需能量', '同时释放出了氧气', '这个被称为光合作用的过程', '为植物世界打开了大门', '此时', '中国的陆地', '也逐渐从海洋露出形成岛屿', '但在相当长的时间里', '陆地十分荒凉没有生机', '这些岩石坚硬', '无法储存水分', '是当时陆地环境的写照', '直到晚上九点多', '也就是四亿年前左右', '些矮小的生命', '开始征服陆地', '她们用一种近似于根的构造', '固定在岩石上', '苔藓', '是陆地最早的拓荒者之', '小', '她们死后的身体', '形成了肥沃的土壤', '让更多的植物可以在这里生存', '从此', '绿色成为植物天堂的底色']
# aaa = aaaaa(same_list)
#
# for i in range(len(same_list)):
# print(i, same_list[i])
#
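The grouping above hinges on Levenshtein.ratio staying at or above 0.5 for lines that are near-duplicates (e.g. OCR variants of the same subtitle). A tiny self-contained check of that threshold:

import Levenshtein

a = '更加壮丽的生命乐章开始了'
b = '更加壮丽的生命乐草开始了'            # one-character OCR-style variant
print(Levenshtein.ratio(a, b))            # ≈ 0.92, so both land in the same group
print(Levenshtein.ratio(a, '地球的午夜'))  # far below 0.5, starts a new group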
# -*- coding: utf-8 -*-
import datetime
import json
import re
import time import time
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
db_storageInsert = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').jixie[
'企业基本信息']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
# sys.path.append('D:\\kkwork\\zzsn_spider\\base')
# import BaseCore
from base import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
return soup
# 采集准备
def redaytowork(com_name):
log.info(f'----当前企业{com_name}--开始处理---')
count = 0
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
return count
else:
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span',
class_='index_title-count__lDSjB').text
except:
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
return count
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName_jixie', com_name)
token.updateTokeen(id_cookie, 2)
# log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
else:
# 开始采集
try:
if spiderwork(soup, com_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie, 3)
return count
else:
return count
except Exception as e:
log.info(f'====={com_name}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
businessinfo = ''
if businessinfo:
try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = \
businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ',
'').replace(
'…', '').replace('\n', '').replace('复制', '').split('(')[0]
except:
name = '曾用名'
value = ''
return value
else:
return ''
# 采集基本信息和工商信息
def spiderinfo(company_url):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
driver.get(company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
script = com_soup.find('script', attrs={'id': '__NEXT_DATA__'}).text
script = json.loads(script)
script = script['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']
companyName = script['name']
updateTime = int(script['updateTimes'])
updateTime = datetime.datetime.fromtimestamp(updateTime / 1000).strftime('%Y-%m-%d %H:%M:%S')
creditCode = script['creditCode']
operName = script['legalPersonName']
phoneNumber = script['phoneNumber']
webSite = script['websiteList']
try:
email = script['emailList'][0]
except:
email = None
desc = script['baseInfo']
status = script['regStatus']
startDate = int(script['estiblishTime'])
startDate = datetime.datetime.fromtimestamp(startDate / 1000).strftime('%Y-%m-%d %H:%M:%S')
registCapi = script['regCapital']
recCap = script['actualCapital']
checkDate = int(script['approvedTime'])
checkDate = datetime.datetime.fromtimestamp(checkDate / 1000).strftime('%Y-%m-%d %H:%M:%S')
orgNo = script['orgNumber']
No = script['regNumber']
taxpayerNo = script['taxNumber']
econKind = script['companyOrgType']
termStart = int(script['fromTime'])
termStart = datetime.datetime.fromtimestamp(termStart / 1000).strftime('%Y-%m-%d %H:%M:%S')
termEnd = script['toTime']
termEnd = datetime.datetime.fromtimestamp(termEnd / 1000).strftime('%Y-%m-%d %H:%M:%S')
taxpayerType = script['taxQualification']
subIndustry = script['industryInfo']['nameLevel3']
belogOrg = script['regInstitute']
info = script['staffNumRange']
canbao = script['socialStaffNum']
try:
originalName = script['historyNames']
originalName = originalName.split('\n')[0]
except:
originalName = None
englishName = script['property3']
address = script['taxAddress']
scope = script['businessScope']
aa_dic = {
'name': companyName, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': creditCode, # 统一社会信用代码
'legalPerson': operName, # 法定代表人
'officialPhone': phoneNumber, # 电话
'officialUrl': webSite, # 官网
'officialEmail': email, # 邮箱
'briefInfo': desc, # 简介
'registerStatus': status, # 登记状态
'incorporationDate': startDate, # 成立日期
'capital': registCapi, # 注册资本
'paidCapital': recCap, # 实缴资本
'approvalDate': checkDate, # 核准日期
'organizationCode': orgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': taxpayerNo, # 纳税人识别号
'type': econKind, # 企业类型
'businessStartDate': termStart, # 营业期限自
'businessEndDate': termEnd, # 营业期限至
'taxpayerQualification': taxpayerType, # 纳税人资质
'industry': subIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': belogOrg, # 登记机关
'scale': info, # 人员规模
'insured': canbao, # 参保人数
'beforeName': originalName, # 曾用名
'englishName': englishName, # 英文名
'importExportEnterpriseCode': None, # 进出口企业代码
'address': address, # 地址
'businessRange': scope, # 经营范围
'status': 0, # 状态
'sourceUpdateTime': updateTime, # 更新时间
'qccId': qccid,
'ynDomestic': '',
'countryName': '',
'securitiesCode': '',
'securitiesShortName': '',
'listingDate': '',
'category': '',
'exchange': '',
'listingType': '',
}
for key, value in aa_dic.items():
if value == 'None':
aa_dic[key] = None
db_storageInsert.insert_one(aa_dic)
def remove_parentheses(text):
# 清除中文小括号
    text = re.sub(r'（|）', '', text)
# 清除英文小括号
text = re.sub(r'\(|\)', '', text)
return text.replace(' ', '')
# 判断名称是否统一
def spiderwork(soup, receptname):
company_url = ''
try:
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={com_name}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
log.info('=====已重新放入redis,cookies已封号======')
return False
# receptname = '小米通讯技术有限公司'
for compamy in company_list:
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
company_url = info_t.find('a')['href']
break
elif not receptname:
company_url = info_t.find('a')['href']
break
else:
jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url)
else:
# 判断是否是曾用名
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
print(child['class'])
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
break
if getname:
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url)
else:
# 没有搜到相同的企业名称
log.info('没有搜索到相同企业名称')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie_no',com_name)
return False
else:
# 没有搜到相同的企业名称
log.info('没有搜索到相同企业名称')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie_no',com_name)
return False
return True
(This commit also removes the file's earlier scratch tail — a commented-out local chromedriver/stealth.min.js setup and a Yahoo Finance press-releases fetch for gpdm = '9021.T' — and appends the login flow and main loop below.)

def login():
    # time.sleep(10)
    cookies_list, id_cookie, user_name = token.get_cookies()
    log.info(f'=====当前使用的是{user_name}的cookie======')
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(5)
    driver.refresh()
    # url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
    # driver.get(url_test)
    # # driver.get('https://www.qcc.com/')
    time.sleep(5)
    return driver, id_cookie


if __name__ == '__main__':
    taskType = '基本信息/天眼查'
    driver = create_driver()
    url = 'https://www.tianyancha.com/'
    driver.get(url)
    driver.maximize_window()
    while True:
        driver, id_cookie = login()
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Cookie': 'TYCID=6f6298905d3011ee96146793e725899d; ssuid=3467188160; _ga=GA1.2.1049062268.1697190322; HWWAFSESID=2eb035742bde209aa60; HWWAFSESTIME=1706586308439; csrfToken=bT_looAjInHGeAnvjjl12L9v; bannerFlag=true; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=0; tyc-user-phone=%255B%252216603863075%2522%252C%2522152%25203756%25200528%2522%252C%2522159%25200367%25203315%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22310689501%22%2C%22first_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThhZDY5NmEyZWY2ODAtMGFlNWNkOTI5M2ExNTM4LTI2MDMxZjUxLTkyMTYwMC0xOGFkNjk2YTJmMGRjNSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMxMDY4OTUwMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22310689501%22%7D%2C%22%24device_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%7D; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2218703752600%22%2C%22userId%22%3A%22310689501%22%7D; tyc-user-info-save-time=1707008605562; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwNzAwODYwNSwiZXhwIjoxNzA5NjAwNjA1fQ.i8WEUrXjG2X__SnGGlnjwNXyOEdXlslrnvzvKZ_xlVA0rdjdsYHdaieAzkmIjoKbuv6Lc4Eqpb70hWIlq2zeoQ; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1705286979,1706586312; searchSessionId=1707118324.99879267;'
        }
        start_time = time.time()
        # 获取企业信息
        com_name = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode_jixie')
        if com_name == '' or com_name is None:
            flag = True
            while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
com_name = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode_jixie')
if com_name:
flag = False
log.info("-----已添加数据------")
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
continue
continue
count = redaytowork(com_name)
time.sleep(10)
break
baseCore.close()