Commit a38c9372 Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

@@ -464,7 +464,8 @@ def zhengquanqihuo():
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse(): def sse():
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761' # url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_=1703469889542'
headers = { headers = {
'Accept': '*/*', 'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate', 'Accept-Encoding': 'gzip, deflate',
@@ -485,9 +486,13 @@ def sse():
# os.makedirs(path) # os.makedirs(path)
for page in range(0, int(total_page)): for page in range(0, int(total_page)):
t = int(time.time()) t = int(time.time())
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=24278800487459370386559742313666&_={t}' url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_={t}'
data = policy.getrequest_json(headers, url_page) data = policy.getrequest_json(headers, url_page)
newslist = data['data']['knowledgeList'] newslist = data['data']['knowledgeList']
# if newslist:
# pass
# else:
# continue
# print(newslist) # print(newslist)
for news in newslist: for news in newslist:
num += 1 num += 1
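A condensed, standalone sketch of the paging loop in this hunk: the `_` query parameter is a Unix-timestamp cache-buster regenerated per request, and every page of the REITs search is fetched as JSON. It uses plain requests rather than the project's policy.getrequest_json, and the long channelCode/trackId parameters are trimmed, so treat the query string as illustrative:

import time
import requests

SSE_SEARCH = ('http://query.sse.com.cn/search/getESSearchDoc.do'
              '?page={page}&limit=10&keyword=REITs&siteName=sse'
              '&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy'
              '&spaceId=3&channelId=10001&_={ts}')

def iter_sse_results(total_page, headers):
    # Walk every result page; a fresh timestamp keeps caches from answering.
    for page in range(int(total_page)):
        url = SSE_SEARCH.format(page=page, ts=int(time.time()))
        data = requests.get(url, headers=headers, timeout=20).json()
        for news in data['data']['knowledgeList']:
            yield news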
@@ -521,8 +526,8 @@ def sse():
content = '' content = ''
response = requests.get(newsUrl, timeout=20) response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc: with fitz.open(stream=response.content, filetype='pdf') as doc:
-for page in doc.pages():
-    content += page.get_text()
+for page_ in doc.pages():
+    content += page_.get_text()
file_href = newsUrl file_href = newsUrl
file_name = title file_name = title
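The PDF-to-text step above, as a self-contained helper (assuming PyMuPDF is installed as fitz and the attachment is reachable with a plain requests GET):

import fitz  # PyMuPDF
import requests

def pdf_text(url: str) -> str:
    # Download the attachment into memory and concatenate the text of every page.
    response = requests.get(url, timeout=20)
    content = ''
    with fitz.open(stream=response.content, filetype='pdf') as doc:
        for page_ in doc.pages():
            content += page_.get_text()
    return content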
@@ -628,7 +633,7 @@ def sse():
for att_id in id_list: for att_id in id_list:
baseCore.deliteATT(att_id) baseCore.deliteATT(att_id)
except Exception as e: except Exception as e:
log.info(f"error!!!{newsUrl}") log.info(f"error!!!{newsUrl}===={title}")
log.info(e) log.info(e)
log.info(f'====第{page}页====处理结束,================') log.info(f'====第{page}页====处理结束,================')
@@ -972,14 +977,14 @@ def guizhou():
if __name__=="__main__": if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx' # file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path) # wb = policy.createfile(file_path)
-reform()
+# reform()
-# shenzhen()
+# # shenzhen()
-zhengquanqihuo()
+# zhengquanqihuo()
try: try:
sse() sse()
except: except:
pass pass
-hebei()
+# hebei()
-guizhou()
+# guizhou()
# zhengquanqihuo() # zhengquanqihuo()
\ No newline at end of file
@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing from REITs_policyData.policy_beijing import beijing
if __name__ == "__mian__": if __name__ == "__main__":
beijing() beijing()
reits.sse() reits.sse()
reits.reform() reits.reform()
......
@@ -403,6 +403,7 @@ class BaseCore:
sql = "select proxy from clb_proxy" sql = "select proxy from clb_proxy"
self.cursor.execute(sql) self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall() proxy_lists = self.cursor.fetchall()
self.cnx.commit()
ip_list = [] ip_list = []
for proxy_ in proxy_lists: for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", '')) ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
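Why the added self.cnx.commit() matters: pymysql runs with autocommit off, so on InnoDB's default REPEATABLE READ isolation a long-lived connection keeps re-reading the same snapshot for repeated SELECTs until the transaction ends; committing after the read lets the next call see newly inserted proxies. A minimal sketch of the same idea (the tuple is unpacked directly instead of via string replacement):

import pymysql

def fetch_proxies(cnx: pymysql.connections.Connection) -> list:
    with cnx.cursor() as cursor:
        cursor.execute("select proxy from clb_proxy")
        rows = cursor.fetchall()
    cnx.commit()  # close the read snapshot so later calls see fresh rows
    # rows are 1-tuples such as ('1.2.3.4:8080',)
    return [row[0] for row in rows]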
@@ -472,6 +473,10 @@ class BaseCore:
# 从Redis的List中获取并移除一个元素 # 从Redis的List中获取并移除一个元素
def redicPullData(self, key): def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = self.r.lpop(key) item = self.r.lpop(key)
return item.decode() if item else None return item.decode() if item else None
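The ping-and-reconnect guard added here (and to rePutIntoR below) as one reusable helper; host, port and db are copied from this hunk and belong to the deployment:

import redis

def ensure_redis(r: redis.Redis) -> redis.Redis:
    # Reuse the client if the connection still answers PING, otherwise rebuild it.
    try:
        r.ping()
        return r
    except Exception:
        return redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

Inside redicPullData/rePutIntoR this would read self.r = ensure_redis(self.r) before the lpop/rpush call.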
@@ -658,6 +663,8 @@ class BaseCore:
return 'cn' return 'cn'
if result[0] == '': if result[0] == '':
return 'cn' return 'cn'
if result[0] == 'ja':
return 'jp'
return result[0] return result[0]
#创建excel文件 #创建excel文件
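The branch added above normalizes the detector's output ('ja' is reported as 'jp', empty results fall back to 'cn'); restated as a small helper, with the upstream detection call left out since it is not shown in this hunk:

def normalize_lang(code: str) -> str:
    # Map raw language-detection output onto the codes the rest of the pipeline expects.
    if not code:
        return 'cn'
    if code == 'ja':
        return 'jp'
    return code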
@@ -685,6 +692,10 @@ class BaseCore:
# 对失败或者断掉的企业 重新放入redis # 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item): def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item) self.r.rpush(key, item)
# 增加计数器的值并返回增加后的值 # 增加计数器的值并返回增加后的值
......
@@ -674,7 +674,7 @@ if __name__ == "__main__":
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
# MengZhi() # MengZhi()
-# NQEnterprise()
+NQEnterprise()
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
@@ -683,6 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task() # AnnualEnterprise_task()
# FinanceFromEast() # FinanceFromEast()
# ipo_code() # ipo_code()
-JingyingfenxiFromEase()
+# JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===') log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
@@ -292,7 +292,7 @@ def dic_handle(result_dic):
return aa_dict return aa_dict
# 采集准备 # 采集准备
-def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
# if social_code: # if social_code:
# dic_info = baseCore.getInfomation(social_code) # dic_info = baseCore.getInfomation(social_code)
@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else: else:
# 开始采集 # 开始采集
try: try:
-if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1 count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}') log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3) token.updateTokeen(id_cookie,3)
@@ -373,7 +373,7 @@ def ifbeforename(company_url):
return '' return ''
# 采集基本信息和工商信息 # 采集基本信息和工商信息
-def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0] qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新 # 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'" updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate aa_dic['listingDate'] = listingDate
aa_dic['category'] = category aa_dic['category'] = category
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# print(aa_dic) # print(aa_dic)
sendkafka(aa_dic) sendkafka(aa_dic)
@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate aa_dic['listingDate'] = listingDate
aa_dic['category'] = category aa_dic['category'] = category
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
sendkafka(aa_dic) sendkafka(aa_dic)
# 判断名称是否统一 # 判断名称是否统一
-def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = '' company_url = ''
try: try:
company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list') company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html' # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html' # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息 # 采集基本信息和工商信息
-spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
# 判断是否是曾用名 # 判断是否是曾用名
tr = tr_list[:1][0] tr = tr_list[:1][0]
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url) beforename = ifbeforename(company_url)
if beforename == receptname: if beforename == receptname:
-spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
else: else:
#没有搜到相同的企业名称 #没有搜到相同的企业名称
data = [com_name, social_code] data = [com_name, social_code]
@@ -549,6 +549,7 @@ if __name__ == '__main__':
else: else:
log.info('==========已无cookies==========') log.info('==========已无cookies==========')
time.sleep(30) time.sleep(30)
continue continue
id_cookie = cookieinfo[0] id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1]) cookie_ = json.loads(cookieinfo[1])
@@ -579,8 +580,8 @@ if __name__ == '__main__':
} }
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
-company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-# company_field = '91220101606092819L||'
+company_field = '913300007125582210||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name) baseCore.sendEmail(file_name)
@@ -595,6 +596,11 @@ if __name__ == '__main__':
while flag: while flag:
log.info('--------已没有数据---------') log.info('--------已没有数据---------')
time.sleep(30) time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
if company_field: if company_field:
flag = False flag = False
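The idle loop now re-checks the MySQL connection before pulling the next company. check_mysql_conn is the project's own helper; a generic equivalent built on pymysql's ping looks like this (connection parameters are whatever the caller normally uses):

import pymysql

def ensure_mysql(cnx: pymysql.connections.Connection, **connect_kwargs):
    # ping(reconnect=True) silently reopens a dropped connection;
    # fall back to a brand-new connection if even that fails.
    try:
        cnx.ping(reconnect=True)
        return cnx
    except Exception:
        return pymysql.connect(**connect_kwargs)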
@@ -604,26 +610,28 @@ if __name__ == '__main__':
continue continue
social_code = company_field.split('|')[0] social_code = company_field.split('|')[0]
-com_name = company_field.split('|')[2].replace(' ', '')
+com_name = company_field.split('|')[1].replace(' ', '')
-ynDomestic = company_field.split('|')[15]
-countryName = company_field.split('|')[16]
-securitiesCode = company_field.split('|')[17]
-securitiesShortName = company_field.split('|')[18]
-listingDate = company_field.split('|')[21]
-category = company_field.split('|')[19]
-exchange = company_field.split('|')[20]
-# ynDomestic = ''
-# countryName = ''
-# securitiesCode = ''
-# securitiesShortName = ''
-# listingDate = ''
-# category = ''
-# exchange = ''
+# ynDomestic = company_field.split('|')[15]
+# countryName = company_field.split('|')[16]
+# securitiesCode = company_field.split('|')[17]
+# securitiesShortName = company_field.split('|')[18]
+# listingDate = company_field.split('|')[21]
+# category = company_field.split('|')[19]
+# exchange = company_field.split('|')[20]
+# listType = company_field.split('|')[21]
+ynDomestic = '1'
+countryName = '中国内地'
+securitiesCode = ''
+securitiesShortName = ''
+listingDate = ''
+category = ''
+exchange = ''
+listType = ''
-count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
+count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
 time.sleep(2)
-# break
+break
# baseCore.r.close() # baseCore.r.close()
# baseCore.sendEmail(file_name) # baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
......
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html' # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html' # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息 # 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
# 判断是否是曾用名 # 判断是否是曾用名
tr = tr_list[:1][0] tr = tr_list[:1][0]
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url) beforename = ifbeforename(company_url)
if beforename == receptname: if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
#没有搜到相同的企业名称 #没有搜到相同的企业名称
data = [com_name, social_code] data = [com_name, social_code]
......
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df_all = pd.read_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', dtype=str)
list_com = []
for num_df in range(len(df_all)):
com_name = str(df_all['企业名称'][num_df])
dic_com = {
'social_code': '',
'com_name': com_name
}
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from sys_base_enterprise where name = %s '''
cursor.execute(sel_sql, com_name)
selects = cursor.fetchone()
if selects:
print(f'【{num_df}/{len(df_all)}】==={com_name}找到')
social_code = selects[0]
else:
print(f'【{num_df}/{len(df_all)}】==={com_name}未找到')
social_code = ''
df_all['信用代码'][num_df] = str(social_code)
df_all.to_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', index=False)
\ No newline at end of file
@@ -28,7 +28,7 @@ headers = {
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site', 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg', 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d', 'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
......
@@ -38,7 +38,7 @@ headers = {
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site', 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg', 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d', 'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
@@ -70,7 +70,7 @@ def beinWork(tyc_code, social_code,start_time):
pass pass
except Exception as e: except Exception as e:
#todo:重新放入redis中 #todo:重新放入redis中
-baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode',social_code)
+baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code)
log.error(f"{tyc_code}-----获取总数接口失败") log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败' error = '获取总数接口失败'
state = 0 state = 0
@@ -302,10 +302,11 @@ def doJob():
continue continue
id = data[0] id = data[0]
xydm = data[2] xydm = data[2]
com_name = data[1]
tycid = data[11] tycid = data[11]
if tycid == None or tycid == '': if tycid == None or tycid == '':
try: try:
-retData = getTycIdByXYDM(xydm)
+retData = getTycIdByXYDM(com_name)
if retData['tycData'] and retData['reput']: if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id'] tycid = retData['tycData']['id']
# todo:写入数据库 # todo:写入数据库
......
@@ -43,7 +43,7 @@ class EsMethod(object):
"must": [ "must": [
{ {
"match": { "match": {
"type": "1" "type": "0"
} }
} }
] ]
@@ -115,7 +115,7 @@ def main(page, p, esMethod):
attid = mms['_source']['attachmentIds'][0] attid = mms['_source']['attachmentIds'][0]
log.info(f'{id}-{attid}--{title}--{sourceAddress}---') log.info(f'{id}-{attid}--{title}--{sourceAddress}---')
-selects = secrchATT('1', attid)
+selects = secrchATT('4', attid)
if selects: if selects:
pass pass
else: else:
......
@@ -228,7 +228,7 @@ def download(data, order_by):
'sid': sid, 'sid': sid,
'sourceAddress': sourceAddress, 'sourceAddress': sourceAddress,
'summary': summary, 'summary': summary,
-'title': name_pdf,
+'title': name_pdf.split('.pdf')[0],
'type': '0' 'type': '0'
} }
# 将相应字段通过kafka传输保存 # 将相应字段通过kafka传输保存
@@ -257,11 +257,11 @@ def download(data, order_by):
else: else:
log.info(f'====pdf解析失败====') log.info(f'====pdf解析失败====')
delete_url(sourceAddress) delete_url(sourceAddress)
-# 获取当前进程pid
-current_pid = baseCore.getPID()
-# todo: 重新启动新进程,杀死当前进程
-subprocess.Popen([sys.executable] + sys.argv)
-os.kill(current_pid, 9)
+# # 获取当前进程pid
+# current_pid = baseCore.getPID()
+# # todo: 重新启动新进程,杀死当前进程
+# subprocess.Popen([sys.executable] + sys.argv)
+# os.kill(current_pid, 9)
return return
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size'] page_size = retData['page_size']
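For reference, the block disabled above implemented a self-restart: spawn a fresh interpreter on the same script and arguments, then kill the running process. As a standalone helper (assuming a POSIX-style signal number) it would be:

import os
import subprocess
import sys

def restart_self():
    # Start a new copy of this script with identical arguments, then exit hard.
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)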
@@ -328,37 +328,152 @@ def download(data, order_by):
log.info(dic_result) log.info(dic_result)
return return
# def Mob():
# url = 'https://www.mob.com/mobData/report'
# res = requests.get(url=url,headers=headers).content
# soup = BeautifulSoup(res,'html.parser')
# max_info = soup.find('span',class_='el-pagination__total').text
# max_info = re.findall('\d{1,4}',max_info)[0]
# # print(type(max_info))
# max_page = int((int(max_info)/9) + 1)
# print(max_page)
# i_id = 0
# for page in range(max_page):
# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
# res = requests.get(url=url, headers=headers).content
# soup = BeautifulSoup(res, 'html.parser')
# result = soup.find('ul', class_='fix')
# li_list = result.find_all('li')
# # for id in range(1, 149):
# id = i_id
# for li in li_list:
# id += 1
# title = li.find('div',class_='title').text
# time = li.find('div',class_='date tc').text.strip()
# year = re.findall('\d{4}',time)[0]
# # for id in range(29,178):
# real_id = 178 - id
# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# # href = 'https://www.mob.com/mobdata/report/169'
# res_href = requests.get(url=href,headers=headers).content
# i_soup = BeautifulSoup(res_href,'html.parser')
# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
# fin_summary = []
# for s in summary_list:
# summary = s.text
# fin_summary.append(summary)
# summary = ''.join(fin_summary)
# dic_post = {
# 'title': title, # 报告名称
# 'url_pdf': url_pdf, # 报告链接
# 'year': year, # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': time, # 时间
# 'origin': 'Mob研究院', # 来源
# 'sourceAddress': href, # 原文链接
# 'content': '', # 内容
# 'summary': summary, # 摘要
# 'sid': '1662008807781212161', # 信息源id
# }
# order_by = 1
# download(dic_post,order_by)
# order_by += 1
# # print(dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# i_id += 9
def Mob(): def Mob():
url = 'https://www.mob.com/mobData/report' # loginfo = baseCore.redicPullData('Mob:loginfo')
res = requests.get(url=url,headers=headers).content # account = loginfo.split('|')[0]
soup = BeautifulSoup(res,'html.parser') # password = loginfo.split('|')[1]
max_info = soup.find('span',class_='el-pagination__total').text # usecount = loginfo.split('|')[2]
max_info = re.findall('\d{1,4}',max_info)[0] usecount = 0
# print(type(max_info)) # 测试用
max_page = int((int(max_info)/9) + 1) account = '13636711746'
print(max_page) password = 'Zhenghao123'
i_id = 0
for page in range(max_page): # account = '18703752600'
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1) # password = 'Axlk010208!'
res = requests.get(url=url, headers=headers).content # account = '13273737131'
soup = BeautifulSoup(res, 'html.parser') # password = 'liu1230...'
result = soup.find('ul', class_='fix') # account = '15237560528'
li_list = result.find_all('li') # password = 'xlk123456!'
# for id in range(1, 149): # account = '17103126138'
id = i_id # password = '171BlackOne'
for li in li_list: # account = '17103128590'
id += 1 # password = '171BlackTwo'
title = li.find('div',class_='title').text browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
time = li.find('div',class_='date tc').text.strip() f_url = 'https://www.mob.com/developer/login'
year = re.findall('\d{4}',time)[0] browser.get(f_url)
# for id in range(29,178): browser.find_element(By.CLASS_NAME, 's1').click()
real_id = 178 - id browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id) browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
# href = 'https://www.mob.com/mobdata/report/169' browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
res_href = requests.get(url=href,headers=headers).content if usecount < 5:
pass
else:
return Mob()
# 获取登录的信息
# url = browser.current_url
# print(url)
url = 'https://www.mob.com/mobdata/report'
browser.get(url)
# tags = browser.find_elements(By.CLASS_NAME, 'main-title')
# for tag in tags:
# if 'Mob研究院' in tag.text:
# tag.click()
# else:
# continue
# # try:
# # web = tag.find_element(By.CLASS_NAME, "")
# # web.click()
# # break
# # except:
# # continue
cookies_list = browser.get_cookies()
cookies = {}
# 获取cookie中的name和value,转化成requests可以使用的形式
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
# cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
# cookies_ = json.dumps(cookies)
session = requests.session()
session.cookies.update(cookies)
for i in range(5):
url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
req = session.get(url=url, headers=headers)
data_json = req.json()
news_list = data_json['data']['list']
for info in news_list:
title = info['title']
publishDate = info['effective_date']
year = publishDate[:4]
report_id = info['report_id']
href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
# tf_url = add_check_url(href)
is_member = r.sismember('report_pdf_three_history', href)
if is_member:
continue
res_href = session.get(url=href, headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser') i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3') summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
headers['token'] = '05bc441a-b09b-40cb-ab65-8d9e63e5c529'
news_req = session.get(url=news_url,headers=headers)
pdf_url = news_req.json()['data']
fin_summary = [] fin_summary = []
for s in summary_list: for s in summary_list:
summary = s.text summary = s.text
@@ -366,13 +481,13 @@ def Mob():
summary = ''.join(fin_summary) summary = ''.join(fin_summary)
dic_post = { dic_post = {
'title': title, # 报告名称 'title': title, # 报告名称
-'url_pdf': url_pdf, # 报告链接
+'url_pdf': pdf_url, # 报告链接
'year': year, # 报告年份 'year': year, # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4) 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf 'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang 'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
-'publishDate': time, # 时间
+'publishDate': publishDate, # 时间
'origin': 'Mob研究院', # 来源 'origin': 'Mob研究院', # 来源
'sourceAddress': href, # 原文链接 'sourceAddress': href, # 原文链接
'content': '', # 内容 'content': '', # 内容
@@ -382,12 +497,7 @@ def Mob():
order_by = 1 order_by = 1
download(dic_post,order_by) download(dic_post,order_by)
order_by += 1 order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
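The rewritten Mob() logs in once with Selenium and then pages the JSON API over plain HTTP by copying the browser cookies into a requests.Session. That hand-off, trimmed of the site-specific login selectors and token header, looks roughly like this:

import requests
from selenium import webdriver

def session_from_browser(browser: webdriver.Chrome) -> requests.Session:
    session = requests.Session()
    # Selenium returns a list of cookie dicts; requests only needs name -> value.
    for cookie in browser.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
    return session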
def yidong_guanxiangtai(): def yidong_guanxiangtai():
@@ -452,85 +562,202 @@ def yidong_guanxiangtai():
# print(res.json()) # print(res.json())
# 巨量算数 # # 巨量算数
def juliangsuanshu(): # def juliangsuanshu():
browser = webdriver.Chrome(chromedriver) # # browser = webdriver.Chrome(chromedriver)
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
#
# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
# browser.get(url)#跳到指定页面
#
# page_source = browser.page_source#获取页面信息
# soup = BeautifulSoup(page_source, 'html.parser')
#
# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
# for one_info in list_all:
# info_title = one_info.a.text.strip()
# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
# info_href = one_info.a.get('href')
# info_url = 'https://trendinsight.oceanengine.com'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
# list_script = soup_info.find_all('script')
# for script in list_script:
# if 'window._SSR_DATA' in script.text:
# json_str = script.text
# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
#
# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
#
# dic_post = {
# 'title': info_title, # 报告名称
# 'url_pdf': info_pdf, # 报告链接
# 'year': info_date[:4], # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': info_date, # 时间
# 'origin': '巨量算数', # 来源
# 'sourceAddress': info_url, # 原文链接
# 'content': '', # 内容
# 'summary': info_zhaiyao, # 摘要
# 'sid': '1662008524476948481', # 信息源id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# browser.quit()
url = 'https://trendinsight.oceanengine.com/arithmetic-report' # 巨量算数
browser.get(url)#跳到指定页面
page_source = browser.page_source#获取页面信息 def getnews(browser):
page_source = browser.page_source # 获取页面信息
soup = BeautifulSoup(page_source, 'html.parser') soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'}) list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {
'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
for one_info in list_all: for one_info in list_all:
info_title = one_info.a.text.strip() try:
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1] info_title = one_info.a.text.strip()
info_href = one_info.a.get('href') info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
info_url = 'https://trendinsight.oceanengine.com'+info_href info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com' + info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser') res_info = requests.get(info_url)
list_script = soup_info.find_all('script') soup_info = BeautifulSoup(res_info.content, 'html.parser')
for script in list_script: list_script = soup_info.find_all('script')
if 'window._SSR_DATA' in script.text: for script in list_script:
json_str = script.text if 'window._SSR_DATA' in script.text:
info_json = json.loads(json_str.replace('window._SSR_DATA = ','')) json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url'] info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
dic_post = { dic_post = {
'title': info_title, # 报告名称 'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接 'url_pdf': info_pdf, # 报告链接
'year': info_date[:4], # 报告年份 'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4) 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf 'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间 'publishDate': info_date, # 时间
'origin': '巨量算数', # 来源 'origin': '巨量算数', # 来源
'sourceAddress': info_url, # 原文链接 'sourceAddress': info_url, # 原文链接
'content': '', # 内容 'content': '', # 内容
'summary': info_zhaiyao, # 摘要 'summary': info_zhaiyao, # 摘要
'sid': '1662008524476948481', # 信息源id 'sid': '1662008524476948481', # 信息源id
} }
order_by = 1 order_by = 1
download(dic_post, order_by) download(dic_post, order_by)
order_by += 1 order_by += 1
# print(page,dic_post) # print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download' # url = 'http://114.115.155.139:5002/report_download'
# # report-list # # report-list
# res = requests.post(url, data=json.dumps(dic_post)) # res = requests.post(url, data=json.dumps(dic_post))
# print(res.json()) # print(res.json())
time.sleep(2) time.sleep(2)
except Exception as e:
continue
# todo:点击下一页
# wait = WebDriverWait(browser, 30)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
# try:
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# except:
# time.sleep(1)
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# return getnews(browser)
def juliangsuanshu():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)#跳到指定页面
getnews(browser)
browser.quit() browser.quit()
def ke36switch(browser,info_url):
try:
browser.get(info_url) # 跳到指定页面
page_source = browser.page_source # 获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
return soup_info
except:
browser.quit()
proxy = baseCore.get_proxy()
# proxy = {
# 'http': '222.90.4.73:40018',
# 'httpS': '222.90.4.73:40018'
# }
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
browser.refresh()
ke36switch(browser,info_url)
# 36氪 # 36氪
def ke36(): def ke36():
# browser = webdriver.Chrome(chromedriver) # browser = webdriver.Chrome(chromedriver)
proxy = baseCore.get_proxy()
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe' url = 'https://36kr.com/academe'
browser.get(url)#跳到指定页面 browser.get(url)#跳到指定页面
time.sleep(3)
for i in range(10):
try:
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'show-more')))
js = "var q=document.documentElement.scrollTop=3000"
browser.execute_script(js)
time.sleep(2)
browser.find_element(By.CLASS_NAME, 'show-more').click()
except:
break
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
page_source = browser.page_source#获取页面信息 page_source = browser.page_source#获取页面信息
soup = BeautifulSoup(page_source, 'html.parser') soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'}) list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
-for one_info in list_all:
+for one_info in list_all[::-1]:
info_title = one_info.find('div',{'class':'title'}).text info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href') info_url = one_info.a.get('href')
# is_member = r.sismember('report_pdf_three_history', info_url)
# if is_member:
# continue
soup_info = ke36switch(browser,info_url)
browser.get(info_url)#跳到指定页面 info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
if info_date < '2023-05-10':
page_source = browser.page_source#获取页面信息 pass
soup_info = BeautifulSoup(page_source, 'html.parser') else:
time.sleep(1)
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10] continue
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text try:
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
except:
proxy = baseCore.get_proxy()
opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
ke36switch(browser, info_url)
dic_post = { dic_post = {
'title': info_title, # 报告名称 'title': info_title, # 报告名称
'url_pdf': '', # 报告链接 'url_pdf': '', # 报告链接
@@ -547,7 +774,7 @@ def ke36():
'sid': '1662008421217378306', # 信息源id 'sid': '1662008421217378306', # 信息源id
} }
order_by = 1 order_by = 1
-download(dic_post, order_by)
+# download(dic_post, order_by)
order_by += 1 order_by += 1
# print(page,dic_post) # print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download' # url = 'http://114.115.155.139:5002/report_download'
@@ -555,52 +782,56 @@ def ke36():
# res = requests.post(url, data=json.dumps(dic_post)) # res = requests.post(url, data=json.dumps(dic_post))
# print(res.json()) # print(res.json())
time.sleep(2) time.sleep(2)
browser.quit() browser.quit()
# 前沿知识库 # 前沿知识库
def qianyanzhishiku(): def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml' for i in range(40,60):
log.info(f'====第{i}页====')
url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
res = requests.get(url) res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser') soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li') # list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
for one_info in list_all: list_all = soup.find('div',{'class':'show_report_list'}).find_all('li')
info_title = one_info.a.get('title') for one_info in list_all:
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01') info_title = one_info.a.get('title')
info_href = one_info.a.get('href') info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
info_url = 'https://wk.askci.com'+info_href info_href = one_info.a.get('href')
info_url = 'https://wk.askci.com'+info_href
res_info = requests.get(info_url) res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser') soup_info = BeautifulSoup(res_info.content,'html.parser')
info_pdf_url = soup_info.find('iframe',{'scrolling':'auto'}).get('src').split('pdfpath=')[1] info_pdf_url = soup_info.find('iframe',{'scrolling':'auto'}).get('src').split('pdfpath=')[1]
info_pdf = urllib.parse.unquote(info_pdf_url) info_pdf = urllib.parse.unquote(info_pdf_url)
dic_post = { dic_post = {
'title': info_title, # 报告名称 'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接 'url_pdf': info_pdf, # 报告链接
'year': info_date[:4], # 报告年份 'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4) 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf 'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间 'publishDate': info_date, # 时间
'origin': '前沿知识库', # 来源 'origin': '前沿知识库', # 来源
'sourceAddress': info_url, # 原文链接 'sourceAddress': info_url, # 原文链接
'content': '', # 内容 'content': '', # 内容
'summary': '', # 摘要 'summary': '', # 摘要
'sid': '1662008620631367682', # 信息源id 'sid': '1662008620631367682', # 信息源id
} }
order_by = 1 order_by = 1
download(dic_post, order_by) download(dic_post, order_by)
order_by += 1 order_by += 1
# print(page,dic_post) # print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download' # url = 'http://114.115.155.139:5002/report_download'
# # report-list # # report-list
# res = requests.post(url, data=json.dumps(dic_post)) # res = requests.post(url, data=json.dumps(dic_post))
# print(res.json()) # print(res.json())
time.sleep(2) time.sleep(2)
# # 世界经济论坛 # # 世界经济论坛
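In the reworked qianyanzhishiku() above, each listing page is index_{i}.shtml and the PDF address is URL-encoded inside the viewer iframe's pdfpath= query parameter. A sketch of that extraction, assuming the same markup:

import urllib.parse
import requests
from bs4 import BeautifulSoup

def report_pdf_url(report_url: str) -> str:
    soup = BeautifulSoup(requests.get(report_url, timeout=20).content, 'html.parser')
    iframe_src = soup.find('iframe', {'scrolling': 'auto'}).get('src')
    # The real PDF path sits after 'pdfpath=' and is percent-encoded.
    return urllib.parse.unquote(iframe_src.split('pdfpath=')[1])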
@@ -664,7 +895,7 @@ def qianyanzhishiku():
def shijiejingjiluntan(): def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'} allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
-for i in range(10, 128):
+for i in range(1, 2):
# res = requests.get(url) # res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser') # soup = BeautifulSoup(res.content,'html.parser')
@@ -672,6 +903,7 @@ def shijiejingjiluntan():
url = f'https://cn.weforum.org/publications/?page={i}' url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url) # 跳到指定页面 browser.get(url) # 跳到指定页面
time.sleep(5)
wait = WebDriverWait(browser, 30) wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11"))) wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source # 获取页面信息 page_source = browser.page_source # 获取页面信息
@@ -685,7 +917,12 @@ def shijiejingjiluntan():
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime'] info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ') datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d') info_date = datetime_obj.strftime('%Y-%m-%d')
-info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
+# if info_date >= '2022-07-21':
# continue
try:
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
except:
info_zhaiyao = ''
try: try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href') info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except: except:
@@ -726,6 +963,28 @@ def shijiejingjiluntan():
time.sleep(2) time.sleep(2)
browser.quit() browser.quit()
def get_json(key_word,page,headers):
param = {
"uid": "",
"keyword": key_word,
"type": ["researchReport"],
"client": "web",
"clientVersion": "curr",
"clientType": "web",
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url, headers=headers).text[1:-1]
res_json = json.loads(res)
return res_json
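Hypothetical usage of the get_json helper added above: the first call reports hitsTotal, and with a page size of 10 the page count is ceil(total / 10). Note that in Python 3 total/10 + 1 yields a float, so an explicit integer conversion is needed before range():

import math

def iter_reports(key_word, headers):
    first = get_json(key_word, 1, headers)
    pages = max(1, math.ceil(first['hitsTotal'] / 10))
    for page in range(1, pages + 1):
        for item in get_json(key_word, page, headers)['result']['researchReport'] or []:
            yield item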
# 东方财富网 # 东方财富网
def dongfangcaifu(): def dongfangcaifu():
headers = { headers = {
@@ -769,101 +1028,70 @@ def dongfangcaifu():
page = 1 page = 1
# for page in range(1,500): # for page in range(1,500):
# log.info(page) # log.info(page)
param = { res_json_ = get_json(key_word, page, headers)
"uid": "", # 添加页数
"keyword": key_word, total = res_json_['hitsTotal']
"type": ["researchReport"], page = (total/10) + 1
"client": "web", for page_ in range(1,page+1):
"clientVersion": "curr", res_json = get_json(key_word,page_,headers)
"clientType": "web", list_all = res_json['result']['researchReport']
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
} if list_all:
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
if list_all:
pass
else:
continue
for one_news in list_all:
news_title = one_news['title']
news_title = news_title.replace('<em>', '').replace('</em>', '')
news_date = one_news['date'][:10]
comparison_date = "2023-12-08"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
else:
pass pass
news_come = one_news['source'] else:
news_code = one_news['code'] continue
for one_news in list_all:
news_title = one_news['title']
news_title = news_title.replace('<em>', '').replace('</em>', '')
news_date = one_news['date'][:10]
comparison_date = "2023-12-08"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
else:
pass
news_come = one_news['source']
news_code = one_news['code']
news_url = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={news_code}' news_url = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={news_code}'
news_res = requests.get(news_url) news_res = requests.get(news_url)
news_soup = BeautifulSoup(news_res.content, 'html.parser') news_soup = BeautifulSoup(news_res.content, 'html.parser')
try: try:
if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text: if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text:
continue
except:
continue continue
except: try:
continue news_content = news_soup.find('div', {'class': 'newsContent'}).text.strip()
try: except:
news_content = news_soup.find('div', {'class': 'newsContent'}).text.strip() news_content = news_soup.find('div', {'class': 'ctx-content'}).text.strip()
except:
news_content = news_soup.find('div', {'class': 'ctx-content'}).text.strip()
try: try:
news_pdf = news_soup.find('div', {'class': 'detail-header'}).find_all('a')[-1].get('href') news_pdf = news_soup.find('div', {'class': 'detail-header'}).find_all('a')[-1].get('href')
except: except:
news_pdf = news_soup.find('span', {'class': 'to-link'}).a.get('href') news_pdf = news_soup.find('span', {'class': 'to-link'}).a.get('href')
dic_post = { dic_post = {
'title': news_title, # 报告名称 'title': news_title, # 报告名称
'url_pdf': news_pdf, # 报告链接 'url_pdf': news_pdf, # 报告链接
'year': news_date[:4], # 报告年份 'year': news_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4) 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': social_code, # 关联记录id,如:企业信用代码 'item_id': social_code, # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf 'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': news_date, # 时间 'publishDate': news_date, # 时间
'origin': '东方财富网-研报中心', # 来源 'origin': '东方财富网-研报中心', # 来源
'sourceAddress': news_url, # 原文链接 'sourceAddress': news_url, # 原文链接
'content': '', # 内容 'content': '', # 内容
'summary': news_content, # 摘要 'summary': news_content, # 摘要
'sid': '1662008733005160449', # 信息源id 'sid': '1662008733005160449', # 信息源id
'come': news_come, 'come': news_come,
} }
order_by = 1 order_by = 1
download(dic_post, order_by) download(dic_post, order_by)
order_by += 1 order_by += 1
# log.info(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# log.info(res.json())
# dic_news = {
# '关键字':key_word,
# '标题':news_title,
# '时间':news_date,
# '来源':news_come,
# '摘要':news_content,
# '原文链接':news_url,
# 'PDF链接':news_pdf,
# }
# list_all_info.append(dic_news)
# if len(list_all) != 10:
# break
# 东方财富网2 # 东方财富网2
def dongfangcaifu2(): def dongfangcaifu2():
@@ -1397,7 +1625,7 @@ if __name__ == '__main__':
# try: # try:
# log.info('mob') # log.info('mob')
# Mob() # Mob()
-# except:
+# except Exception as e:
# pass # pass
# try: # try:
# log.info('yidong_guanxiangtai') # log.info('yidong_guanxiangtai')
@@ -1407,24 +1635,25 @@ if __name__ == '__main__':
# try: # try:
# log.info('juliangsuanshu') # log.info('juliangsuanshu')
# juliangsuanshu() # juliangsuanshu()
-# except:
+# except Exception as e:
# pass # pass
# try: # try:
# log.info('ke36') # log.info('ke36')
# ke36() # ke36()
-# except:
+# except Exception as e:
# ke36()
# pass # pass
# try: # try:
# log.info('qianyanzhishiku') # log.info('qianyanzhishiku')
# qianyanzhishiku() # qianyanzhishiku()
# except:
# pass
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e: # except Exception as e:
# log.info(e)
# pass # pass
try:
log.info('shijiejingjiluntan')
shijiejingjiluntan()
except Exception as e:
log.info(e)
pass
# try: # try:
# log.info('dongfangcaifu') # log.info('dongfangcaifu')
# dongfangcaifu() # dongfangcaifu()
@@ -1442,31 +1671,31 @@ if __name__ == '__main__':
# except Exception as e: # except Exception as e:
# log.info(e) # log.info(e)
# pass # pass
#
# try: # try:
# log.info('dongfangcaifu4') # log.info('dongfangcaifu4')
# dongfangcaifu4() # dongfangcaifu4()
# except Exception as e: # except Exception as e:
# log.info(e) # log.info(e)
# pass # pass
#
-try:
+# try:
-log.info('dongfangcaifu5')
+# log.info('dongfangcaifu5')
-dongfangcaifu5()
+# dongfangcaifu5()
-except Exception as e:
+# except Exception as e:
-log.info(e)
+# log.info(e)
-pass
+# pass
+#
-try:
+# try:
-log.info('dongfangcaifu6')
+# log.info('dongfangcaifu6')
-dongfangcaifu6()
+# dongfangcaifu6()
-except Exception as e:
+# except Exception as e:
-log.info(e)
+# log.info(e)
-pass
+# pass
+#
-try:
+# try:
-log.info('dongfangcaifu7')
+# log.info('dongfangcaifu7')
-dongfangcaifu7()
+# dongfangcaifu7()
-except Exception as e:
+# except Exception as e:
-log.info(e)
+# log.info(e)
-pass
+# pass
...@@ -53,12 +53,12 @@ class EsMethod(object): ...@@ -53,12 +53,12 @@ class EsMethod(object):
# 'hits.hits._source.createDate', # 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate', # 'hits.hits._source.publishDate',
] # 字段2 ] # 字段2
result = self.es.search(index=index_name resultb = self.es.search(index=index_name
, doc_type='_doc' , doc_type='_doc'
, filter_path=filter_path , filter_path=filter_path
, body=body) , body=body)
# log.info(result) # log.info(result)
return result return resultb
def updateaunn(self, index_name, id, content, contentWithTag): def updateaunn(self, index_name, id, content, contentWithTag):
body = { body = {
...@@ -67,24 +67,28 @@ class EsMethod(object): ...@@ -67,24 +67,28 @@ class EsMethod(object):
'contentWithTag': contentWithTag 'contentWithTag': contentWithTag
} }
} }
result = self.es.update(index=index_name resulta = self.es.update(index=index_name
,id=id ,id=id
,body=body) ,body=body)
log.info('更新结果:%s' % result) log.info('更新结果:%s' % resulta)
def paserUrl(html,listurl): def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签 # 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img']) links = html.find_all(['a', 'img'])
print(len(links))
# 遍历标签,将相对地址转换为绝对地址 # 遍历标签,将相对地址转换为绝对地址
for link in links: for link in links:
print(link)
if 'href' in link.attrs: if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href']) # link['href'] = urljoin(listurl, link['href'])
pass
elif 'src' in link.attrs: elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src']) pass
# link['src'] = urljoin(listurl, link['src'])
return html return html
def get_news(news_url,ip_dic): def get_news(news_url,sourceAddress,id):
header = { header = {
'Host': 'www.sec.gov', 'Host': 'www.sec.gov',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic): ...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic):
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D' 'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
} }
response = requests.get(url=news_url,headers=header,verify=False,timeout=30) response = requests.get(url=news_url,headers=header,verify=False)
# aa = response.text
# print(response.text)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30) # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if response.status_code == 200: if response.status_code == 200:
# 请求成功,处理响应数据 # 请求成功,处理响应数据
# print(response.text) # print(response.text)
result = BeautifulSoup(response.content,'html.parser') # result_ = BeautifulSoup(response.content,'html.parser')
result_ = BeautifulSoup(response.text, 'lxml')
# print(result) # print(result)
pass pass
else: else:
# 请求失败,输出错误信息 # 请求失败,输出错误信息
log.info('请求失败:', response.status_code, response.text) log.info('请求失败:', response.status_code, response.text)
result = '' result_ = ''
return result if result_:
pass
# 相对路径转化为绝对路径
# soup = paserUrl(result_, sourceAddress)
time.sleep(2)
content = result_.text.strip()
# del(result_)
# content = result_
# print(content)
time.sleep(2)
esMethod.updateaunn(esMethod.index_name, str(id), content, str(result_))
def main(esMethod): def main(esMethod):
redis_conn = redis.Redis(connection_pool=pool) redis_conn = redis.Redis(connection_pool=pool)
id_ = redis_conn.lpop('NianbaoUS:id') id_ = redis_conn.lpop('NianbaoUS:id')
id = id_.decode()
# id = "23101317164" # id = "23101317164"
if id: if id_:
pass pass
else: else:
log.info('已无数据') log.info('已无数据')
return return False
id = id_.decode()
result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id) result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id)
result = result_['hits']['hits'][0] result = result_['hits']['hits'][0]
num = 0 num = 0
...@@ -135,17 +153,8 @@ def main(esMethod): ...@@ -135,17 +153,8 @@ def main(esMethod):
log.info(f'====={title}=={social_code}===正在更新===') log.info(f'====={title}=={social_code}===正在更新===')
sourceAddress = result['_source']['sourceAddress'] sourceAddress = result['_source']['sourceAddress']
ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'} ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
soup = get_news(sourceAddress,ip_dic) get_news(sourceAddress,sourceAddress,id)
if soup: return True
pass
else:
return
# 相对路径转化为绝对路径
soup = paserUrl(soup, sourceAddress)
content = soup.text.strip()
esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
return
def run_threads(num_threads,esMethod): def run_threads(num_threads,esMethod):
...@@ -164,6 +173,9 @@ if __name__ == '__main__': ...@@ -164,6 +173,9 @@ if __name__ == '__main__':
while True: while True:
esMethod = EsMethod() esMethod = EsMethod()
start = time.time() start = time.time()
num_threads = 5 # num_threads = 5
run_threads(num_threads,esMethod) # run_threads(num_threads,esMethod)
log.info(f'5线程 总耗时{time.time()-start}秒') # log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file result = main(esMethod)
if not result:
break
\ No newline at end of file
# 证监会沪市、gong深市 公司债券和企业债券采集 # 证监会沪市、深市 公司债券和企业债券采集
"""
证监会企业名单
"""
import time import time
import random import random
import requests import requests
...@@ -25,7 +22,7 @@ cursor = baseCore.cursor ...@@ -25,7 +22,7 @@ cursor = baseCore.cursor
cnx_ = baseCore.cnx_ cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_ cursor_ = baseCore.cursor_
taskType = '企业名单/证监会' taskType = '企业债券/证监会'
def createDriver(): def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe' chrome_driver = r'D:\cmd100\chromedriver.exe'
...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信 ...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信
page = soup.find('div', class_='pages').find_all('li')[-1] page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text total = page.find('b').text
for i in range(1,int(total)+1): # for i in range(1,int(total)+1):
for i in range(224, 225):
log.info(f'==========正在采集第{i}页=========') log.info(f'==========正在采集第{i}页=========')
if i == 1: if i == 1:
href = url href = url
...@@ -241,7 +239,7 @@ if __name__ == '__main__': ...@@ -241,7 +239,7 @@ if __name__ == '__main__':
# url_parms = ['201010', '201014'] # url_parms = ['201010', '201014']
# url_parms = ['201011', '201013'] # url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014'] url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014'] # url_parms = ['201411']
for url_parm in url_parms: for url_parm in url_parms:
url = getUrl(url_parm) url = getUrl(url_parm)
......
import yfinance as yf
# 获取股票数据
stock = yf.Ticker("MET")
# 获取资产负债表数据
balance_sheet = stock.balance_sheet
# 获取报告日期
report_dates = balance_sheet.index
print(report_dates)
# 获取现金流量表数据
cashflow_statement = stock.cashflow
# 获取利润表数据
income_statement = stock.financials
print(balance_sheet)
print(cashflow_statement)
print(income_statement)
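# 补充示意(非定论,按 yfinance 常见返回结构假设:列为报告期、行为科目,字段以实际返回为准):
# 逐个报告期打印各科目前几项,便于核对上面的 report_dates 取自 index 还是 columns
for report_date in balance_sheet.columns:
    print(report_date, balance_sheet[report_date].dropna().head())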
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
...@@ -57,8 +57,8 @@ def page_list(): ...@@ -57,8 +57,8 @@ def page_list():
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -69,11 +69,11 @@ def page_list(): ...@@ -69,11 +69,11 @@ def page_list():
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList' url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,453): for i in range(1,2):
log.info(f'采集第{i}页数据') log.info(f'采集第{i}页数据')
k=i k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}' da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
...@@ -110,8 +110,8 @@ def detailpaser(dmsg): ...@@ -110,8 +110,8 @@ def detailpaser(dmsg):
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -122,7 +122,7 @@ def detailpaser(dmsg): ...@@ -122,7 +122,7 @@ def detailpaser(dmsg):
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
try: try:
......
import json
import time
import uuid
import pymysql
import redis
import requests
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = baseCore.cnx_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
pathType = 'CrowDingZhi/'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
'Host': 'query.sse.com.cn',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
create_time, come, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn')
# log.info(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where type_id=15 and full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# log.info("更新完成:{}".format(pdf_id))
return pdf_id
def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
# response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
response = requests.get(pdf_url)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
for i in range(0, 3):
try:
name = str(getuuid()) + '.' + category
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
if category == 'pdf':
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
else:
page_size = 0
retData['content'] = ''
break
except Exception as e:
time.sleep(3)
continue
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
log.info(f'error---{e}')
return retData
return retData
if __name__ == "__main__":
num = 0
t = int(time.time()*1000)
url_ = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req_ = requests.get(url=url_, headers=headers)
data_json = req_.json()
print(data_json)
pageCount = data_json['pageHelp']['pageCount']
for i in range(1,int(pageCount + 1)):
url = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={i}&pageHelp.beginPage={i}&pageHelp.cacheSize=1&pageHelp.endPage={i}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req = requests.get(url=url, headers=headers)
data_list = req.json()['result']
for info in data_list:
publishDate = info['cmsOpDate'] # 处理日期
year = publishDate[:4]
com = '上海证券交易所'
docTitle = info['docTitle'] # 处理事由
docType = info['docType'] # 文档类型
docURL = "http://" + info['docURL'] # 链接 http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
flag = r.sismember('IN-20231227-0001', docURL)
if flag:
log.info('信息已采集入库过')
continue
# 上传至obs
retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
path = retData['path']
full_path = retData['full_path']
file_size = retData['file_size']
create_by = retData['create_by']
content = retData['content']
status = 1
num += 1
create_time = time_now
# 上传到附件表
att_id = tableUpdate(year, docTitle+'.'+docType, 15, '', '', path, full_path, docType, file_size, num, status, create_by, create_time, com, page_size)
if att_id:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sid = '1739914218978594817'
info_code = "IN-20231227-0001"
dic_news = {
'attachmentIds': str(att_id),
'content': content,
'contentWithTag': '',
'id': '',
'origin': com,
'publishDate': publishDate,
'sid': sid,
'sourceAddress': docURL,
'title': docTitle,
'source':'16',
'type': ''
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(e)
log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
r.sadd(info_code, docURL)
continue
# 中央全面深化改革委员会会议
import json
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'www.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
soup = BeautifulSoup(request.content, 'html.parser')
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
year = int(publishDate[:4])
if year < 2023:
continue
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
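# 判重示意(草稿,仿照本库上交所脚本的 Redis 集合判重写法;键名 'IN-SHGG-0001' 为假设,r 需另行初始化):
# if r.sismember('IN-SHGG-0001', newsUrl):
#     continue
# ……发送 kafka 成功后再执行 r.sadd('IN-SHGG-0001', newsUrl)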
news_request = requests.get(url=newsUrl, headers=headers)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
'origin': source,
'contentWithTag': str(contentwithTag),
'content': content,
'summary': summary,
'publishDate': publishDate,
'sid': sid,
'subjectId': '1681549361661489154',
'sourceAddress':newsUrl,
'checkStatus': 1,
'deleteFlag': 0,
'createDate': time_now,
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
\ No newline at end of file
...@@ -27,35 +27,25 @@ class EsMethod(object): ...@@ -27,35 +27,25 @@ class EsMethod(object):
def __init__(self): def __init__(self):
# 创建Elasticsearch对象,并提供账号信息 # 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300) self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'policy' self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum): def queryatt(self,index_name,pnum):
body = { body = {
"query": { "query": {
"bool": {
"must": [ "bool": {
{ "must": [
"nested" : { {
"query" : { "term": {
"bool" : { "sid.keyword": {
"must" : [ "value": "1662008524476948481"
{ }
"match_phrase" : { }
"labels.relationId" : { }
"query" : "1698" ]
} }
} },
} "size":0,
]
}
},
"path" : "labels"
}
}
]
}
},
"size":0,
"aggs":{ "aggs":{
"duplicate_titles":{ "duplicate_titles":{
"terms":{ "terms":{
...@@ -112,7 +102,7 @@ def main(page, p, esMethod): ...@@ -112,7 +102,7 @@ def main(page, p, esMethod):
unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents] unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
# 删除重复的文档 # 删除重复的文档
for doc_id in unique_document_ids: for doc_id in unique_document_ids:
esMethod.delete(index_name="policy", id=doc_id) esMethod.delete(index_name="researchreportdata", id=doc_id)
......
...@@ -121,7 +121,7 @@ def get_content2(): ...@@ -121,7 +121,7 @@ def get_content2():
except Exception as e: except Exception as e:
log.info(f'---{href}--------{e}-------') log.info(f'---{href}--------{e}-------')
continue continue
if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href: if '.wps' in file_href or '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name:
......
# 天眼查商标申请数量
# 列表接口(本脚本中已注释未用,POST):https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 实际调用统计接口(GET):https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/国内上市'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
social_code = '91130629MA0CG2DL51'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
\ No newline at end of file
# 天眼查商标申请数量
# 列表接口(本脚本中已注释未用,POST):https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 实际调用统计接口(GET):https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/中国500强'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:zg500shSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
...@@ -56,7 +56,7 @@ if __name__=="__main__": ...@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
browser.get(url) browser.get(url)
# 可改动 # 可改动
time.sleep(10) time.sleep(20)
s = requests.session() s = requests.session()
#获取到token和cookies #获取到token和cookies
......
...@@ -239,6 +239,8 @@ if __name__=="__main__": ...@@ -239,6 +239,8 @@ if __name__=="__main__":
list_all_info = [] list_all_info = []
while True: while True:
#一次拿取一篇文章 #一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo() dict_json =getjsonInfo()
if dict_json: if dict_json:
if get_info(dict_json): if get_info(dict_json):
......
...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page): ...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page):
cnx_.commit() cnx_.commit()
except Exception as e: except Exception as e:
log.error(f"保存数据库失败:{e}") log.error(f"保存数据库失败:{e}")
# todo: 放入redis
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------") log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0: if listCount==0:
#列表为空认为结束 #列表为空认为结束
......
from bs4 import BeautifulSoup
import requests,time,re
from base import BaseCore
# import pandas as pd
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
log = baseCore.getLogger()
taskType = '500强专利'
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
# df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
def name_handle(english_name_):
if 'INC.' in english_name_ or 'LTD.' in english_name_ or 'CO.' in english_name_ \
or 'CORP.' in english_name_ or 'GMBH' in english_name_ \
or ' AG' in english_name_ or 'SARL' in english_name_ or 'S.A.' in english_name_ \
or 'PTY' in english_name_ or 'LLC' in english_name_ or 'LLP' in english_name_ \
or ' AB' in english_name_ or ' NV' in english_name_ or 'N.V.' in english_name_ \
or 'A.S.' in english_name_ or ' SA' in english_name_ or ',Limited' in english_name_ \
or ' SE' in english_name_ or ' PLC' in english_name_ or 'S.P.A.' in english_name_:
english_name = english_name_.replace('INC.', '').replace('LTD.', '').replace('CO.', '').replace('CORP.', '') \
.replace('GMBH', '').replace(' AG', '').replace('SARL', '').replace('S.A.', '').replace('PTY', '') \
.replace('LLC', '').replace('LLP', '').replace(' AB', '').replace(' NV', '').replace(',', '') \
.replace('A.S.', '').replace(' SA', '').replace(',Limited', '').replace(' SE', '').replace(' PLC', '') \
.replace('N.V.', '').replace('S.P.A.', '').rstrip()
return english_name
else:
english_name = english_name_
return english_name
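# 用法示意:name_handle('APPLE INC.') -> 'APPLE';name_handle('SIEMENS AG') -> 'SIEMENS'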
if __name__ == '__main__':
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gwSocial_code')
# social_code = '9111000071093123XX'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
english_name_ = data[5]
place = data[6]
if place == 1:
log.info(f'{com_name}--国内')
baseCore.rePutIntoR('Zhuanli:gwSocial_code',social_code)
continue
if english_name_:
pass
else:
query = f"select * from sys_base_enterprise where social_credit_code ='{xydm}'"
cursor_.execute(query)
reslut = cursor_.fetchone()
english_name_ = reslut[32]
# todo:将该字段更新到144企业库
update_ = f"update EnterpriseInfo set EnglishName='{english_name_}' where SocialCode='{xydm}' "
cursor.execute(update_)
cnx.commit()
english_name_ = english_name_.upper()
english_name = name_handle(english_name_)
num_zhuanli = 0
# url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
# int(float(time.time()) * 1000))
#
# res1 = requests.get(url1, headers=headers)
# soup1 = BeautifulSoup(res1.content, 'html.parser')
#
# num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
#
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
# if zhuanli:
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={english_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli == 0:
dic_info = {
'com_name': com_name,
'social_code': social_code,
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-----已经存在--{year}--无专利信息")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}------新增----无专利信息")
break
dic_info = {
'com_name': com_name,
'social_code': social_code,
'year': year,
'num': zhuanli
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
except:
log.info("error!{}".format(social_code))
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
\ No newline at end of file
import requests,re,time,os,datetime,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import redis
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
for i in range(len(df_all['英文名称'])):
for num in range(0, 2):
try:
if '中国' not in df_all['企业所属国家'][i]:
com_name = df_all['英文名称'][i]
num_zhuanli = 0
url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
int(float(time.time()) * 1000))
res1 = requests.get(url1, headers=headers)
soup1 = BeautifulSoup(res1.content, 'html.parser')
num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
zhuanli = '10000'
if zhuanli == '10000':
for year in range(2023, 1900, -1):
# url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={com_name}&IN=&CPC=&IC=&rnd=' + str(
# int(float(time.time()) * 1000))
url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli2 = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli2 == 0:
break
df_all.loc[i, f'{year}'] = zhuanli2  # 用 .loc 直接写回,避免链式赋值可能不生效的问题
# num_zhuanli = num_zhuanli + zhuanli2
num_zhuanli = num_zhuanli + zhuanli2
print(year)
time.sleep(random.uniform(1.5, 2))
else:
num_zhuanli = int(zhuanli)
time.sleep(random.uniform(1.5, 2))
df_all.loc[i, 'Espacenet专利检索'] = num_zhuanli
print(f"{com_name} : {num_zhuanli}")
break
except:
if num == 0:
print("重试")
time.sleep(60)
continue
else:
print("error!{}".format(df_all['英文名称'][i]))
\ No newline at end of file
import requests,time,re,random import functools
import random
import threading
import traceback
import pymysql
import requests,time
from base import BaseCore from base import BaseCore
import pandas as pd import concurrent.futures
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx # cnx = baseCore.cnx
cursor = baseCore.cursor # cursor = baseCore.cursor
log = baseCore.getLogger() log = baseCore.getLogger()
taskType = '天眼查专利/国内上市' taskType = '天眼查专利/国内榜单'
# 需调整放入国外和国内的redis
# 设置一个全局变量用于控制线程退出
should_exit = False
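# 使用示意(草稿,实际循环结构以下文 run 逻辑为准):各工作线程在 while not should_exit 中反复调用 runSpider(),
# runSpider 从 Redis 取不到 social_code 时置位 should_exit,让所有线程自然退出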
def connectSql():
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
return cnx,cursor
#关闭数据库连接
def closeSql(cnx,cursor):
cnx.close()
cursor.close()
# 获取代理
def get_proxy():
cnx,cursor = connectSql()
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
cnx.commit()
closeSql(cnx,cursor)
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 4)]
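# 说明(假设 clb_proxy 表的 proxy 字段形如 '1.2.3.4-8080'):上面按 '-' 拆分后拼成 {'http': 'http://1.2.3.4:8080', 'https': ...};
# random.randint(0, 4) 要求库中至少有 5 条代理记录,否则会索引越界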
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info): def spider_zhuanli(com_name, social_code, tycid):
page = 1
start_time = time.time() start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000) t = int(time.time() * 1000)
header = { header = {
...@@ -36,171 +78,259 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info): ...@@ -36,171 +78,259 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
'sec-ch-ua-platform': '"Windows"', 'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web' 'version': 'TYC-Web'
} }
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100' while True:
log.info(f'===正在处理第{page}页===')
try: url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
ip = baseCore.get_proxy() try:
except: ip = get_proxy()
time.sleep(2) except:
ip = baseCore.get_proxy() time.sleep(2)
try: ip = get_proxy()
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json() try:
except: res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
for i in range(3): except:
try: for i in range(3):
res_j = requests.get(url=url, headers=header, verify=False).json() try:
except: res_j = requests.get(url=url, headers=header, verify=False).json()
time.sleep(2) except:
continue time.sleep(2)
# print(res_j) continue
list_all = res_j['data']['items'] # print(res_j)
# print(list_all) try:
if list_all: list_all = res_j['data']['items']
for one_zhuanli in list_all: except:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = { dic_info = {
'企业名称': com_name, '企业名称': com_name,
'统一信用代码': social_code, '统一信用代码': social_code
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
} }
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' " cnx, cursor = connectSql()
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
# lock.acquire()
cursor.execute(selectSql) cursor.execute(selectSql)
count = cursor.fetchone()[0] count = cursor.fetchone()[0]
closeSql(cnx, cursor)
# lock.release()
if count > 0: if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在") log.info(f"{com_name}---{social_code}---已经存在---无专利")
continue log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
break
else: else:
values_tuple = tuple(dic_info.values()) values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增") # log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" cnx, cursor = connectSql()
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
# lock.acquire()
cursor.execute(insertSql, values_tuple) cursor.execute(insertSql, values_tuple)
cnx.commit() cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增") # lock.release()
time.sleep(2) closeSql(cnx,cursor)
# list_all_info.append(dic_info) log.info(f"{com_name}---{social_code}---新增---无专利")
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}") log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
return page break
else:
return 0
if __name__ == "__main__": if list_all:
while True: for one_zhuanli in list_all:
list_all_info = [] title = one_zhuanli['title']
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 try:
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code') shenqingri = one_zhuanli['applicationTime']
# social_code = '9111010566840059XP' except:
# 判断 如果Redis中已经没有数据,则等待 shenqingri = ''
if social_code == None: try:
# time.sleep(20) shenqing_code = one_zhuanli['patentNum']
break except:
start = time.time() shenqing_code = ''
try: try:
data = baseCore.getInfomation(social_code) leixing = one_zhuanli['patentType']
if len(data) != 0: except:
pass leixing = ''
else: try:
# 数据重新塞入redis status = one_zhuanli['lprs']
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code) except:
continue status = ''
id = data[0] try:
com_name = data[1] gongkairi = one_zhuanli['pubDate']
xydm = data[2] except:
tycid = data[11] gongkairi = ''
if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(xydm) gongkai_code = one_zhuanli['pubnumber']
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except: except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
cnx, cursor = connectSql()
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
# lock.acquire()
cursor.execute(selectSql)
count = cursor.fetchone()[0]
# lock.release()
closeSql(cnx,cursor)
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
cnx,cursor = connectSql()
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
closeSql(cnx,cursor)
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
page+=1
else:
log.info(f"---{social_code}----{tycid}--共{page}页--结束处理")
break
def runSpider():
# 根据从Redis中拿到的社会信用代码, 在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
social_code = '91360400794798498A'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
# 任务执行结束后设置should_exit为True
global should_exit
should_exit = True
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
return False
id = data[0]
com_name = data[1]
tycid = data[11]
place = data[6]
if place != 1:
baseCore.rePutIntoR('Zhuanli:gwSocial_code', social_code)
return False
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(social_code)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
cnx,cursor = connectSql()
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{social_code}'"
cursor.execute(updateSql)
cnx.commit()
closeSql(cnx,cursor)
elif not retData['tycData'] and retData['reput']:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code) log.info(f'======={social_code}====重新放入redis====')
continue baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
count = data[17] return False
log.info(f"{id}---{xydm}----{tycid}----开始处理") elif not retData['reput'] and not retData['tycData']:
page = 1 return False
while True: except:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info) state = 0
if page != 0: takeTime = baseCore.getTimeCost(start, time.time())
page += 1 baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
return False
else: log.info(f"{id}---{social_code}----{tycid}----开始处理")
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info) spider_zhuanli(com_name, social_code, tycid)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理") except Exception as e:
break
except Exception as e: traceback.print_exc()
log.info(f'==={social_code}=====获取企业信息失败==={e}=') log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code) baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
finally:
# global should_exit
# should_exit = True
return
# if __name__ == "__main__":
# while True:
# # 创建一个线程池,指定线程数量为4
# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
# results = []
# while True:
# # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# # social_code = '91350700856994874M'
# # 判断 如果Redis中已经没有数据,则等待
# if social_code == None:
# # time.sleep(20)
# break
#
# future = executor.submit(runSpider, social_code)
# results.append(future)
# # 获取任务的执行结果
# for future in concurrent.futures.as_completed(results):
# try:
# result = future.result()
# # 处理任务的执行结果
# print(f"任务执行结束: {result}")
# except Exception as e:
# # 处理任务执行过程中的异常
# # print(f"任务执行exception: {e}")
# traceback.print_exc()
def run_threads(num_threads):
threads = []
for i in range(num_threads):
thread = threading.Thread(target=runSpider)
threads.append(thread)
thread.start()
# while True:
# if should_exit:
# break
for thread in threads:
thread.join()
if __name__ == '__main__':
while True:
start = time.time()
num_threads = 1
run_threads(num_threads)
        log.info(f'{num_threads}线程 总耗时{time.time()-start}秒')
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
try:
ip = baseCore.get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
    except:
        res_j = {}
        for i in range(3):
            try:
                res_j = requests.get(url=url, headers=header, verify=False).json()
                break  # stop retrying once the request succeeds
            except:
                time.sleep(2)
                continue
# print(res_j)
try:
list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all)
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
else:
return 0
if __name__ == "__main__":
while True:
list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code_zg500')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
...@@ -13,9 +13,10 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut ...@@ -13,9 +13,10 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka] [kafka]
bootstrap_servers=114.115.159.144:9092 bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo topic=keyWordsInfo
groupId=python_baidu_test groupId=python_google
[selenium] [selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe ;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe ;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
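For reference, a minimal sketch of reading the updated [selenium] paths back with configparser; the filename 'config.ini' and the standalone script are assumptions, only the section and key names come from the file above. The old lines prefixed with ';' are treated as comments, so only the new paths are returned.

import configparser

# Sketch: read the [selenium] section shown above ('config.ini' is an assumed filename).
config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

chrome_driver = config.get('selenium', 'chrome_driver')      # D:\cmd100\chromedriver.exe
binary_location = config.get('selenium', 'binary_location')  # D:\Google\Chrome\Application\chrome.exe
print(chrome_driver, binary_location)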
...@@ -168,6 +168,8 @@ class GoogleSpider(object): ...@@ -168,6 +168,8 @@ class GoogleSpider(object):
try: try:
driver.get(url) driver.get(url)
# 等待页面加载完成 # 等待页面加载完成
time.sleep(3)
driver.refresh()
wait = WebDriverWait(driver, 20) wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
html=driver.page_source html=driver.page_source
...@@ -256,6 +258,7 @@ class GoogleSpider(object): ...@@ -256,6 +258,7 @@ class GoogleSpider(object):
self.driver.get(self.url) self.driver.get(self.url)
# 等待页面加载完成 # 等待页面加载完成
time.sleep(3) time.sleep(3)
self.driver.refresh()
wait = WebDriverWait(self.driver, 20) wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
search_input = self.driver.find_element('xpath', '//textarea[@title="Google 搜索"]') search_input = self.driver.find_element('xpath', '//textarea[@title="Google 搜索"]')
...@@ -265,7 +268,11 @@ class GoogleSpider(object): ...@@ -265,7 +268,11 @@ class GoogleSpider(object):
time.sleep(3) time.sleep(3)
wait = WebDriverWait(self.driver, 20) wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click() try:
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click()
except:
self.driver.find_element('xpath', '//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span').click()
time.sleep(3) time.sleep(3)
self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click() self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
time.sleep(2) time.sleep(2)
...@@ -273,7 +280,8 @@ class GoogleSpider(object): ...@@ -273,7 +280,8 @@ class GoogleSpider(object):
time.sleep(2) time.sleep(2)
self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click() self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
except Exception as e: except Exception as e:
print(e) self.logger.info(f'--点击按钮失效----{e}')
return
self.logger.info("开始抓取首页..." + self.searchkw ) self.logger.info("开始抓取首页..." + self.searchkw )
time.sleep(5) time.sleep(5)
flag, lists = self.parse_page() flag, lists = self.parse_page()
...@@ -446,7 +454,7 @@ class GoogleSpider(object): ...@@ -446,7 +454,7 @@ class GoogleSpider(object):
detailurl=detailmsg['detailUrl'] detailurl=detailmsg['detailUrl']
title = detailmsg['title'] title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title) content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag) contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e: except Exception as e:
content='' content=''
contentWithTag='' contentWithTag=''
......
...@@ -40,7 +40,7 @@ class GoogleTaskJob(object): ...@@ -40,7 +40,7 @@ class GoogleTaskJob(object):
try: try:
for record in consumer: for record in consumer:
try: try:
logger.info("value:",record.value) logger.info(f"value:{record.value}")
keymsg=record.value keymsg=record.value
if keymsg: if keymsg:
break break
...@@ -176,7 +176,7 @@ if __name__ == '__main__': ...@@ -176,7 +176,7 @@ if __name__ == '__main__':
continue continue
if kwList: if kwList:
# 创建一个线程池,指定线程数量为4 # 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据 # 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList] results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果 # 获取任务的执行结果
......
import requests
url = 'https://www.ctwant.com/article/308534'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
req = requests.get(url, headers=headers)
print(req.text)
\ No newline at end of file
...@@ -113,23 +113,23 @@ if __name__=='__main__': ...@@ -113,23 +113,23 @@ if __name__=='__main__':
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '') author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except: except:
continue continue
# if len(author)>4: if len(author)>4:
# continue continue
# if '(' in author or '本刊' in author or '国家' in author\ # if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\ # or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\ # or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author : # or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \ if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \ or '记者' in author or '新闻社' in author \
# or '”' in author\ or '”' in author\
# or '大学' in author or '洛桑江村' in author: or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
continue continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href'] new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href) is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
if is_member: if is_member:
continue continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '') new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
...@@ -165,7 +165,7 @@ if __name__=='__main__': ...@@ -165,7 +165,7 @@ if __name__=='__main__':
} }
log.info(dic_news) log.info(dic_news)
if sendKafka(dic_news): if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href) r.sadd('qiushileaderspeech_two::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}') log.info(f'采集成功----{dic_news["sourceAddress"]}')
...@@ -55,56 +55,56 @@ from obs import ObsClient ...@@ -55,56 +55,56 @@ from obs import ObsClient
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
baseCore = BaseCore() # baseCore = BaseCore()
log = baseCore.getLogger() # log = baseCore.getLogger()
cnx_ = baseCore.cnx # cnx_ = baseCore.cnx
cursor_ = baseCore.cursor # cursor_ = baseCore.cursor
#
def use_ocr(img): # def use_ocr(img):
ocr = ddddocr.DdddOcr() # ocr = ddddocr.DdddOcr()
with open(img, 'rb') as f: # with open(img, 'rb') as f:
image = f.read() # image = f.read()
res = ocr.classification(image) # res = ocr.classification(image)
print(res) # print(res)
return res # return res
#
if __name__=="__main__": # if __name__=="__main__":
requests.DEFAULT_RETRIES = 5 # requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}') # log.info(f'开始时间为:{time_start}')
#
requests.adapters.DEFAULT_RETRIES = 3 # requests.adapters.DEFAULT_RETRIES = 3
headers = { # headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36', # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
} # }
#
opt = webdriver.ChromeOptions() # opt = webdriver.ChromeOptions()
opt.add_argument( # opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36') # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
#
opt.add_argument("--ignore-certificate-errors") # opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors") # opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"]) # opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging']) # opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False) # opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe' # opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe' # chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) # browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "http://zxgk.court.gov.cn/shixin/" # url = "http://zxgk.court.gov.cn/shixin/"
browser.get(url) # browser.get(url)
# 可改动 # # 可改动
time.sleep(20) # time.sleep(20)
page_source = browser.page_source # page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser') # soup = BeautifulSoup(page_source, 'html.parser')
img_url = soup.select('img[id="captchaImg"]')[0]['src'] # img_url = soup.select('img[id="captchaImg"]')[0]['src']
#
browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司') # browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
#
#
browser.find_element(By.ID, 'yzm').send_keys(yzm) # browser.find_element(By.ID, 'yzm').send_keys(yzm)
browser.find_element(By.ID, 'searchForm').click() # browser.find_element(By.ID, 'searchForm').click()
wait = WebDriverWait(browser, 30) # wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) # wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# screen_img_path = "D:/screen/xxx.png" # screen_img_path = "D:/screen/xxx.png"
# out_img_path = "D:/out/xxx.png" # out_img_path = "D:/out/xxx.png"
...@@ -112,3 +112,27 @@ if __name__=="__main__": ...@@ -112,3 +112,27 @@ if __name__=="__main__":
# #
# code = use_ocr(out_img_path) # code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code) # 验证码输入框元素.send_keys(code)
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
\ No newline at end of file
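For files larger than the report fetched above, a streamed download avoids holding the whole PDF in memory. A minimal sketch, assuming the same url and headers as in the script above remain in scope (the chunk size is arbitrary):

import requests

# Sketch only: stream the PDF to disk in chunks instead of using res.content.
# Assumes the `url` and `headers` defined in the script above.
with requests.get(url=url, headers=headers, stream=True, timeout=30) as res:
    res.raise_for_status()
    with open('./a.pdf', 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)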
#百度翻译 不登录翻译1000字 登录翻译5000字
import re
import string
import time
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore
baseCore = BaseCore()
class Translate():
def __init__(self):
""""
initialize the class, and include the fundamental attributes
"""
# self._lang_list = ['zh', 'en', 'kor', 'fra', 'jp', 'el', 'ru']
# self._lang_list_original = ["中文", "英语", "韩语", "法语", "日语", "希腊语", "俄语"]
# self._num = len(self._lang_list)
self.url = "https://fanyi.baidu.com/#{}/{}/{}"
self.header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
self.db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软['数据源_0106']
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def translate(self, sentence, browser, lang):
sentence_ = sentence
# browser = self.createDriver()
wait = WebDriverWait(browser, 20)
try:
word_type = self.get_input_language_type(sentence_, browser, wait)
except:
browser.quit()
browser = self.createDriver()
result, browser = self.translate(sentence_, browser, lang)
return result, browser
if word_type:
if word_type == lang:
pass
else:
word_type = lang
url = self.url.format(word_type, 'zh', sentence_)
browser.set_page_load_timeout(10)
try:
browser.get(url)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')))
result_ = browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')
result = result_.text.strip()
print(f'翻译后的句子:{result}')
return result, browser
except:
browser.quit()
print(f'翻译失败,重新翻译。当前句子为{sentence_}')
browser = self.createDriver()
result, browser = self.translate(sentence_, browser, lang)
return result, browser
def get_input_language_type(self, word, browser, wait):
browser.get("https://fanyi.baidu.com/")
wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
input_word = browser.find_element(By.ID, "baidu_translate_input")
input_word.send_keys(word)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')))
word_type = browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')
word_type = word_type.get_attribute("data-lang")
return word_type
def is_punctuation(self, char):
punctuation = string.punctuation + '、' + '(' + '…' + ')' + '《' + '》' + '“' + '”' + ':' + ';' + '!' + ' ' + '。'
return char in punctuation
def sentence_split_sentence(self, contentWithTag):
pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
match_group = pattern.finditer(contentWithTag)
sentences = []
if match_group:
for _ in match_group:
start_end_index = _.span()
sentences.append((start_end_index[0], start_end_index[1], _.group()))
if (not sentences) and (len(contentWithTag) >= 4):
sentences.append((0, len(contentWithTag), contentWithTag))
return sentences
def jionstr(self, html):
paragraphs = []
current_sentence = ''
for tag in html.find_all(text=True):
sentence = str(tag)
if sentence == '\n' or sentence == '\t' or sentence == ' ':
continue
if self.is_punctuation(sentence):
continue
            # 检查拼接后的句子长度是否超过1000字
            if len(current_sentence) + len(sentence) <= 1000:
                current_sentence += sentence
            else:
                paragraphs.append(current_sentence.strip())
                current_sentence = sentence
        # append the last accumulated chunk so the tail of the text is not lost
        if current_sentence.strip():
            paragraphs.append(current_sentence.strip())
        return paragraphs
def gethtml(self):
# data = self.db_storage.find_one({'titleForeign':{'$ne':''}})
try:
browser = self.createDriver()
except:
browser = self.createDriver()
datas = self.db_storage.find({'postCode': '2', 'newsTime': {'$gte': '2024-01-01', '$lt': '2024-01-02'}}).limit(10)
for data in datas:
contentWithTag = data['richTextForeign']
# 根据分段符\n拆分,拿取纯文本,翻译
# # 拆分成段
# # pattern1 = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
# sentence_list = self.sentence_split_sentence(contentWithTag)
# print(sentence_list)
# # 每段拆分成标签
# result_list = []
# # for sentence_tag in tqdm(sentence_list):
# sentence_xml = BeautifulSoup(sentence_tag[2], 'lxml')
# for tag in sentence_xml.find_all(text=True):
# sentence =
# if len(sentence.strip()) == 0:
# # # print(f'aa当前内容为:{sentence}')
# result = sentence.strip()
# sentence_xml.text.replace(sentence, result)
# result_list.append({
# "start_index": sentence_tag[0],
# "sentence": result,
# "sentence_xml": sentence_xml
# })
# elif self.is_punctuation(sentence.strip()) or len(sentence.strip()) == 1:
# # # print(f'bb当前内容为:{sentence}')
# result_list.append({
# "start_index": sentence_tag[0],
# "sentence": sentence,
# "sentence_xml": sentence_xml
# })
# else:
# # 翻译文本
# result = self.translate(sentence)
# new_xml = sentence_tag[2].replace(sentence, result)
#
# result_list.append({
# "start_index": sentence_tag[0],
# # "sentence": sentence + "\n",
# "sentence": result,
# "sentence_xml": new_xml
# })
# # todo: 对内容进行排序,保证顺序对
# sorted_context_list = sorted(result_list, key=lambda x: x["start_index"])
# final_list = [item["sentence_xml"] for item in sorted_context_list]
#
# return f'\n'.join(final_list)
# paragraphs = self.jionstr(contentWithTag)
html = BeautifulSoup(contentWithTag, 'html.parser')
content = html.text
lang = baseCore.detect_language(content)
for tag in html.find_all(text=True):
sentence = str(tag)
# sentence = " 実際に働き手の数が8がけ(8割)になる16年後、介護のようなケアサービスを今のような形で受けることは困難になると予測される。"
if sentence == '\n' or sentence == '\t' or sentence == ' ':
continue
if self.is_punctuation(sentence):
continue
# if len(sentence) > 1000:
if len(sentence) > 50:
print(len(sentence))
# index_1000 = sentence[999]
index_1000 = sentence[49]
# 判断该字符是不是逗号或句号
if index_1000 == '.' or index_1000 == '。' or index_1000 == ',' or index_1000 == ',':
# 如果是标点符号
# print(f'当前的段1:{sentence[:1000]}')
print(f'当前的段1:{sentence[:50]}')
# result1, browser = self.translate(sentence[:1000].strip(), browser, lang)
result1, browser = self.translate(sentence[:50].strip(), browser, lang)
# print(f'当前的段2:{sentence[1000:]}')
print(f'当前的段2:{sentence[50:]}')
# result2, browser = self.translate(sentence[1000:].strip(), browser, lang)
result2, browser = self.translate(sentence[50:].strip(), browser, lang)
tag.replace_with(result1+result2)
else:
# 如果不是标点符号
# i = 1000
i = 50
while i >= 0:
j = i-1
if j <= 0:
break
index_punctuation = sentence[j]
if index_punctuation == '.' or index_punctuation == '。' or index_punctuation == ',' or index_punctuation == ',':
print(f'当前的段3:{sentence[:j+1]}')
result1, browser = self.translate(sentence[:j+1].strip(), browser, lang)
print(f'当前的段4:{sentence[j+1:]}')
result2, browser = self.translate(sentence[j+1:].strip(), browser, lang)
tag.replace_with(result1+result2)
break
else:
i = j
continue
if i == 1:
print(f'当前的段5:{sentence}')
# result, browser = self.translate(sentence[:1000].strip(), browser, lang)
result, browser = self.translate(sentence[:50].strip(), browser, lang)
tag.replace_with(result)
continue
else:
# 翻译
print(f'当前的段6:{sentence}')
result, browser = self.translate(sentence, browser, lang)
# 替换
tag.replace_with(result)
time.sleep(2)
print(html.prettify())
# return html.prettify()
if __name__ == "__main__":
test = Translate()
# test.translate()
# print(test.gethtml())
test.gethtml()
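The length-based splitting inside gethtml (find the last '.'/'。'/','/',' before the 50- or 1000-character limit and translate the halves separately) could be factored into a small helper. The sketch below only illustrates that logic; the name split_at_punctuation is not part of the original code, and its fallback keeps the remainder instead of dropping it.

# Sketch of the splitting logic used in gethtml(); the helper name is illustrative.
def split_at_punctuation(sentence, max_len=1000, puncts=('.', '。', ',', ',')):
    """Split a long sentence into chunks no longer than max_len,
    cutting at the last punctuation mark before the limit when possible."""
    if len(sentence) <= max_len:
        return [sentence]
    cut = max_len                      # fall back to a hard cut if no punctuation is found
    for i in range(max_len - 1, 0, -1):
        if sentence[i] in puncts:
            cut = i + 1                # keep the punctuation with the left chunk
            break
    return [sentence[:cut]] + split_at_punctuation(sentence[cut:], max_len, puncts)

# Possible usage inside gethtml(), mirroring the existing 50-character test value:
# for part in split_at_punctuation(sentence, max_len=50):
#     result, browser = self.translate(part.strip(), browser, lang)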
#coding=utf-8 #coding=utf-8
...@@ -25,7 +25,7 @@ from baseCore import BaseCore ...@@ -25,7 +25,7 @@ from baseCore import BaseCore
import configparser import configparser
from smart_extractor import SmartExtractor from smart_extractor import SmartExtractor
# baseCore=BaseCore()
class BaiduSpider(object): class BaiduSpider(object):
def __init__(self,searchkw,wordsCode,sid): def __init__(self,searchkw,wordsCode,sid):
...@@ -40,13 +40,15 @@ class BaiduSpider(object): ...@@ -40,13 +40,15 @@ class BaiduSpider(object):
port=self.config.get('redis', 'port'), port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0) password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1 self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver') # chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers') # self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver) # path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions() # chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location') # chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options) # proxy = baseCore.get_proxy()
# driver = webdriver.Chrome(chrome_options=chrome_options) # chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# # driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue() self.qtitle = Queue()
self.qurl = Queue() self.qurl = Queue()
self.detailList = Queue() self.detailList = Queue()
...@@ -54,14 +56,16 @@ class BaiduSpider(object): ...@@ -54,14 +56,16 @@ class BaiduSpider(object):
self.wordsCode = wordsCode self.wordsCode = wordsCode
self.sid = sid self.sid = sid
def createDriver(self): def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver') chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver) path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions() chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location') chrome_options.binary_location = self.config.get('selenium', 'binary_location')
# 设置代理 proxy = baseCore.get_proxy()
# proxy = "127.0.0.1:8080" # 代理地址和端口 chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options) self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
#将列表数据插入到表中 meta_search_result #将列表数据插入到表中 meta_search_result
def itemInsertToTable(self,items): def itemInsertToTable(self,items):
try: try:
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -12,12 +12,16 @@ from kafka import KafkaProducer ...@@ -12,12 +12,16 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer from kafka import KafkaConsumer
import json import json
import itertools import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider from baiduSpider import BaiduSpider
import concurrent.futures import concurrent.futures
from baseCore import BaseCore from baseCore import BaseCore
from queue import Queue from queue import Queue
import configparser import configparser
from tqdm import tqdm
class BaiduTaskJob(object): class BaiduTaskJob(object):
def __init__(self): def __init__(self):
...@@ -39,7 +43,7 @@ class BaiduTaskJob(object): ...@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers], bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8'))) value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try: try:
for record in consumer: for record in tqdm(consumer, desc="Consuming messages"):
try: try:
logger.info("value:",record.value) logger.info("value:",record.value)
keymsg=record.value keymsg=record.value
...@@ -119,7 +123,15 @@ class BaiduTaskJob(object): ...@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
kwList=[] kwList=[]
if searchEngines: if searchEngines:
if '3' in searchEngines: if '3' in searchEngines:
keyword=keymsg['keyWord'] start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
keymsglist=self.getkeywords(keyword) keymsglist=self.getkeywords(keyword)
for kw in keymsglist: for kw in keymsglist:
kwmsg={ kwmsg={
...@@ -157,6 +169,25 @@ class BaiduTaskJob(object): ...@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
# finally: # finally:
# baiduSpider.driver.quit() # baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw) # logger.info("关键词采集结束!"+searchkw)
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def runSpider(self,kwmsg): def runSpider(self,kwmsg):
searchkw=kwmsg['kw'] searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode'] wordsCode=kwmsg['wordsCode']
...@@ -166,6 +197,8 @@ class BaiduTaskJob(object): ...@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html() baiduSpider.get_page_html()
except Exception as e: except Exception as e:
try: try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html() baiduSpider.get_page_html()
except Exception as e: except Exception as e:
logger.info('百度搜索异常'+searchkw) logger.info('百度搜索异常'+searchkw)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -293,6 +293,7 @@ class BaseCore: ...@@ -293,6 +293,7 @@ class BaseCore:
sql = "select proxy from clb_proxy" sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql) self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall() proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = [] ip_list = []
for proxy_ in proxy_lists: for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", '')) ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
...@@ -304,8 +305,8 @@ class BaseCore: ...@@ -304,8 +305,8 @@ class BaseCore:
"port": str_ip_list[1], "port": str_ip_list[1],
} }
proxy = { proxy = {
"HTTP": proxyMeta, "http": proxyMeta,
"HTTPS": proxyMeta "https": proxyMeta
} }
proxy_list.append(proxy) proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)] return proxy_list[random.randint(0, 3)]
......
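The lowercase keys introduced in the hunk above match what the requests library expects in its proxies mapping (scheme lookup is lowercase, so "HTTP"/"HTTPS" keys are never matched). A minimal usage sketch; the proxy address is a placeholder, not a real entry from clb_proxy:

import requests

# requests looks up proxies by lowercase scheme; uppercase keys are ignored.
proxyMeta = "http://127.0.0.1:8888"   # placeholder address, not from clb_proxy
proxy = {
    "http": proxyMeta,
    "https": proxyMeta,
}
resp = requests.get("https://www.example.com", proxies=proxy, timeout=10)
print(resp.status_code)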
[redis] [redis]
...@@ -16,6 +16,8 @@ topic=keyWordsInfo ...@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId=python_baidu groupId=python_baidu
[selenium] [selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe ;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe ;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
# from baiduSpider import BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# try:
# detailurl = url
# title = title
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
# except Exception as e:
# content = ''
# contentWithTag = ''
#
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
res_json = res.text
print(res_json)
\ No newline at end of file
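The body returned by this endpoint is wrapped in the jQuery callback named in the cb parameter of the URL, so it has to be unwrapped before json parsing. A hedged sketch of doing that; the helper name and regex are illustrative, not part of the original script:

import json
import re

def strip_jsonp(text):
    """Illustrative helper: remove a jQueryxxx(...) callback wrapper and parse the JSON."""
    match = re.search(r'^\s*[\w$.]+\((.*)\)\s*;?\s*$', text, re.S)
    if not match:
        raise ValueError('response does not look like JSONP')
    return json.loads(match.group(1))

# Possible usage with the response fetched above:
# data = strip_jsonp(res_json)
# print(list(data.keys()) if isinstance(data, dict) else data)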