Commit a38c9372  Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

......@@ -464,7 +464,8 @@ def zhengquanqihuo():
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse():
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
# url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_=1703469889542'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
......@@ -485,9 +486,13 @@ def sse():
# os.makedirs(path)
for page in range(0, int(total_page)):
t = int(time.time())
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=24278800487459370386559742313666&_={t}'
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_={t}'
data = policy.getrequest_json(headers, url_page)
newslist = data['data']['knowledgeList']
# if newslist:
# pass
# else:
# continue
# print(newslist)
for news in newslist:
num += 1
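The hunk above trims the long channelCode list and walks `page` from 0 to `total_page`, regenerating the `_` cache-buster from `time.time()` on every request. A minimal standalone sketch of that pagination pattern follows (plain `requests`, a trimmed-down query string for illustration only; the real request keeps every parameter shown in the hunk and the `policy.getrequest_json` helper):

import time
import requests

def fetch_sse_pages(keyword='REITs', limit=10, total_page=3):
    headers = {'Referer': 'http://www.sse.com.cn/', 'User-Agent': 'Mozilla/5.0'}
    results = []
    for page in range(total_page):
        t = int(time.time())  # cache-busting timestamp, same role as the `_` parameter above
        url = ('http://query.sse.com.cn/search/getESSearchDoc.do'
               f'?page={page}&limit={limit}&keyword={keyword}&siteName=sse&_={t}')
        data = requests.get(url, headers=headers, timeout=20).json()
        results.extend(data.get('data', {}).get('knowledgeList', []))
    return results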
......@@ -521,8 +526,8 @@ def sse():
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
for page_ in doc.pages():
content += page_.get_text()
file_href = newsUrl
file_name = title
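The rename from `page` to `page_` avoids shadowing the outer pagination variable. For reference, a self-contained sketch of the PyMuPDF extraction used here (download a PDF into memory, concatenate the text of every page):

import fitz  # PyMuPDF
import requests

def pdf_text(pdf_url: str) -> str:
    resp = requests.get(pdf_url, timeout=20)
    content = ''
    # open the PDF from the in-memory bytes instead of writing a temp file
    with fitz.open(stream=resp.content, filetype='pdf') as doc:
        for page_ in doc.pages():
            content += page_.get_text()
    return content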
......@@ -628,7 +633,7 @@ def sse():
for att_id in id_list:
baseCore.deliteATT(att_id)
except Exception as e:
log.info(f"error!!!{newsUrl}")
log.info(f"error!!!{newsUrl}===={title}")
log.info(e)
log.info(f'====第{page}页====处理结束,================')
......@@ -972,14 +977,14 @@ def guizhou():
if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
reform()
# shenzhen()
zhengquanqihuo()
# reform()
# # shenzhen()
# zhengquanqihuo()
try:
sse()
except:
pass
hebei()
guizhou()
# hebei()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file
......@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing
if __name__ == "__mian__":
if __name__ == "__main__":
beijing()
reits.sse()
reits.reform()
......
......@@ -403,6 +403,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
self.cnx.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
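The added `self.cnx.commit()` matters because the proxy table is read repeatedly on a long-lived pymysql connection: under InnoDB's default REPEATABLE READ isolation the connection keeps returning the same snapshot until the transaction ends, so committing after the SELECT lets the next call see freshly inserted proxies. A minimal sketch of the pattern, assuming a plain pymysql connection:

def load_proxies(cnx):
    # read the current proxy list; commit afterwards so the next SELECT on this
    # long-lived connection starts a new snapshot and sees newly added rows
    with cnx.cursor() as cursor:
        cursor.execute("select proxy from clb_proxy")
        rows = cursor.fetchall()
    cnx.commit()
    return [row[0] for row in rows]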
......@@ -472,6 +473,10 @@ class BaseCore:
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = self.r.lpop(key)
return item.decode() if item else None
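Both `redicPullData` here and `rePutIntoR` further down now guard the call with `self.r.ping()` and rebuild the client on failure, so a dropped idle connection no longer kills the worker. Factored out, the guard looks roughly like this (host/port/db copied from the hunk):

import redis

def ensure_redis(r):
    """Return a usable Redis client, reconnecting if the old one has gone stale."""
    try:
        r.ping()
        return r
    except Exception:
        return redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

# usage inside the helpers:
#   self.r = ensure_redis(self.r)
#   item = self.r.lpop(key)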
......@@ -658,6 +663,8 @@ class BaseCore:
return 'cn'
if result[0] == '':
return 'cn'
if result[0] == 'ja':
return 'jp'
return result[0]
#创建excel文件
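The new branch maps langdetect's ISO 639-1 code 'ja' onto the in-house region code 'jp', alongside the existing fallback of empty results to 'cn'. As a tiny sketch of that normalization (the surrounding detection call is assumed):

def normalize_lang(code: str) -> str:
    # empty detection falls back to 'cn'; langdetect's 'ja' becomes the site's 'jp'
    if not code:
        return 'cn'
    if code == 'ja':
        return 'jp'
    return code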
......@@ -685,6 +692,10 @@ class BaseCore:
# 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item)
# 增加计数器的值并返回增加后的值
......
......@@ -674,7 +674,7 @@ if __name__ == "__main__":
# BaseInfoEnterprise()
# FBS()
# MengZhi()
# NQEnterprise()
NQEnterprise()
# SEC_CIK()
# dujioashou()
# omeng()
......@@ -683,6 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task()
# FinanceFromEast()
# ipo_code()
JingyingfenxiFromEase()
# JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
......@@ -292,7 +292,7 @@ def dic_handle(result_dic):
return aa_dict
# 采集准备
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
# if social_code:
# dic_info = baseCore.getInfomation(social_code)
......@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else:
# 开始采集
try:
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3)
......@@ -373,7 +373,7 @@ def ifbeforename(company_url):
return ''
# 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
......@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# print(aa_dic)
sendkafka(aa_dic)
......@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
sendkafka(aa_dic)
# 判断名称是否统一
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = ''
try:
company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
......@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
# 判断是否是曾用名
tr = tr_list[:1][0]
......@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
else:
#没有搜到相同的企业名称
data = [com_name, social_code]
......@@ -549,6 +549,7 @@ if __name__ == '__main__':
else:
log.info('==========已无cookies==========')
time.sleep(30)
continue
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
......@@ -579,8 +580,8 @@ if __name__ == '__main__':
}
start_time = time.time()
# 获取企业信息
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '91220101606092819L||'
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '913300007125582210||'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name)
......@@ -595,6 +596,11 @@ if __name__ == '__main__':
while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
if company_field:
flag = False
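While the worker idles waiting for new Redis entries it now re-validates the MySQL connection with `check_mysql_conn` and swaps in `baseCore.cnx` plus a fresh cursor when the old handle has timed out. A hedged sketch of that keep-alive loop (`check_mysql_conn` is assumed to return a boolean, as the hunk implies):

import time

def wait_for_work(baseCore, cnx_, cursor_, key='BaseInfoEnterprise:gnqy_socialCode'):
    """Block until Redis hands out a new task, keeping the MySQL handle alive meanwhile."""
    company_field = baseCore.redicPullData(key)
    while not company_field:
        time.sleep(30)                       # nothing queued yet
        if not baseCore.check_mysql_conn(cnx_):
            cnx_ = baseCore.cnx              # fall back to the shared connection
            cursor_ = cnx_.cursor()          # and refresh the cursor bound to it
        company_field = baseCore.redicPullData(key)
    return company_field, cnx_, cursor_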
......@@ -604,26 +610,28 @@ if __name__ == '__main__':
continue
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[2].replace(' ', '')
ynDomestic = company_field.split('|')[15]
countryName = company_field.split('|')[16]
securitiesCode = company_field.split('|')[17]
securitiesShortName = company_field.split('|')[18]
listingDate = company_field.split('|')[21]
category = company_field.split('|')[19]
exchange = company_field.split('|')[20]
# ynDomestic = ''
# countryName = ''
# securitiesCode = ''
# securitiesShortName = ''
# listingDate = ''
# category = ''
# exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
com_name = company_field.split('|')[1].replace(' ', '')
# ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19]
# exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21]
ynDomestic = '1'
countryName = '中国内地'
securitiesCode = ''
securitiesShortName = ''
listingDate = ''
category = ''
exchange = ''
listType = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(2)
# break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
......
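The hunk above switches the company name to index 1 of the pipe-delimited Redis payload and hard-codes the listing fields (`ynDomestic='1'`, `countryName='中国内地'`, the rest empty) instead of reading indexes 15-21. If the full payload comes back, splitting once and indexing defensively avoids repeated `split('|')` calls; a sketch with the layout taken from the commented lines (the `listType` index is left out because the comments give index 21 to both it and `listingDate`):

def parse_company_field(company_field: str):
    parts = company_field.split('|')
    def at(i, default=''):
        return parts[i] if i < len(parts) else default
    return {
        'social_code': at(0),
        'com_name': at(1).replace(' ', ''),
        'ynDomestic': at(15, '1'),
        'countryName': at(16, '中国内地'),
        'securitiesCode': at(17),
        'securitiesShortName': at(18),
        'category': at(19),
        'exchange': at(20),
        'listingDate': at(21),
    }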
......@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
# 判断是否是曾用名
tr = tr_list[:1][0]
......@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
#没有搜到相同的企业名称
data = [com_name, social_code]
......
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df_all = pd.read_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', dtype=str)
list_com = []
for num_df in range(len(df_all)):
com_name = str(df_all['企业名称'][num_df])
dic_com = {
'social_code': '',
'com_name': com_name
}
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from sys_base_enterprise where name = %s '''
cursor.execute(sel_sql, com_name)
selects = cursor.fetchone()
if selects:
print(f'【{num_df}/{len(df_all)}】==={com_name}找到')
social_code = selects[0]
else:
print(f'【{num_df}/{len(df_all)}】==={com_name}未找到')
social_code = ''
df_all['信用代码'][num_df] = str(social_code)
df_all.to_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', index=False)
\ No newline at end of file
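This new helper script resolves credit codes one name at a time and writes back with chained indexing (`df_all['信用代码'][num_df] = ...`), which pandas only tolerates with a SettingWithCopy warning; `df_all.loc[num_df, '信用代码']` is the safer spelling. An alternative sketch that pulls the whole mapping in one query and merges it (table and column names as in the script, file paths hypothetical):

import pandas as pd
import pymysql

cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK',
                      db='dbScore', charset='utf8mb4')
df_all = pd.read_excel('企业名单.xlsx', dtype=str)  # hypothetical path

codes = pd.read_sql('select name as 企业名称, social_credit_code as 信用代码 '
                    'from sys_base_enterprise', cnx)
df_all = df_all.drop(columns=['信用代码'], errors='ignore').merge(codes, on='企业名称', how='left')
df_all['信用代码'] = df_all['信用代码'].fillna('')
df_all.to_excel('企业名单_补全.xlsx', index=False)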
......@@ -28,7 +28,7 @@ headers = {
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
......
......@@ -38,7 +38,7 @@ headers = {
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
......@@ -70,7 +70,7 @@ def beinWork(tyc_code, social_code,start_time):
pass
except Exception as e:
#todo:重新放入redis中
baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode',social_code)
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code)
log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败'
state = 0
......@@ -302,10 +302,11 @@ def doJob():
continue
id = data[0]
xydm = data[2]
com_name = data[1]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
retData = getTycIdByXYDM(com_name)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
......
......@@ -43,7 +43,7 @@ class EsMethod(object):
"must": [
{
"match": {
"type": "1"
"type": "0"
}
}
]
......@@ -115,7 +115,7 @@ def main(page, p, esMethod):
attid = mms['_source']['attachmentIds'][0]
log.info(f'{id}-{attid}--{title}--{sourceAddress}---')
selects = secrchATT('1', attid)
selects = secrchATT('4', attid)
if selects:
pass
else:
......
......@@ -228,7 +228,7 @@ def download(data, order_by):
'sid': sid,
'sourceAddress': sourceAddress,
'summary': summary,
'title': name_pdf,
'title': name_pdf.split('.pdf')[0],
'type': '0'
}
# 将相应字段通过kafka传输保存
......@@ -257,11 +257,11 @@ def download(data, order_by):
else:
log.info(f'====pdf解析失败====')
delete_url(sourceAddress)
# 获取当前进程pid
current_pid = baseCore.getPID()
# todo: 重新启动新进程,杀死当前进程
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
# # 获取当前进程pid
# current_pid = baseCore.getPID()
# # todo: 重新启动新进程,杀死当前进程
# subprocess.Popen([sys.executable] + sys.argv)
# os.kill(current_pid, 9)
return
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
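`name_pdf.split('.pdf')[0]` strips the extension so it does not end up in the ES title, but it only handles lowercase `.pdf`. `os.path.splitext` covers any suffix; a one-line sketch:

import os

def strip_ext(name_pdf: str) -> str:
    # '某某报告.pdf' -> '某某报告'; also handles '.PDF', '.docx', and names without a dot
    return os.path.splitext(name_pdf)[0]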
......@@ -328,37 +328,152 @@ def download(data, order_by):
log.info(dic_result)
return
# def Mob():
# url = 'https://www.mob.com/mobData/report'
# res = requests.get(url=url,headers=headers).content
# soup = BeautifulSoup(res,'html.parser')
# max_info = soup.find('span',class_='el-pagination__total').text
# max_info = re.findall('\d{1,4}',max_info)[0]
# # print(type(max_info))
# max_page = int((int(max_info)/9) + 1)
# print(max_page)
# i_id = 0
# for page in range(max_page):
# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
# res = requests.get(url=url, headers=headers).content
# soup = BeautifulSoup(res, 'html.parser')
# result = soup.find('ul', class_='fix')
# li_list = result.find_all('li')
# # for id in range(1, 149):
# id = i_id
# for li in li_list:
# id += 1
# title = li.find('div',class_='title').text
# time = li.find('div',class_='date tc').text.strip()
# year = re.findall('\d{4}',time)[0]
# # for id in range(29,178):
# real_id = 178 - id
# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# # href = 'https://www.mob.com/mobdata/report/169'
# res_href = requests.get(url=href,headers=headers).content
# i_soup = BeautifulSoup(res_href,'html.parser')
# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
# fin_summary = []
# for s in summary_list:
# summary = s.text
# fin_summary.append(summary)
# summary = ''.join(fin_summary)
# dic_post = {
# 'title': title, # 报告名称
# 'url_pdf': url_pdf, # 报告链接
# 'year': year, # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': time, # 时间
# 'origin': 'Mob研究院', # 来源
# 'sourceAddress': href, # 原文链接
# 'content': '', # 内容
# 'summary': summary, # 摘要
# 'sid': '1662008807781212161', # 信息源id
# }
# order_by = 1
# download(dic_post,order_by)
# order_by += 1
# # print(dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# i_id += 9
def Mob():
url = 'https://www.mob.com/mobData/report'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_info = soup.find('span',class_='el-pagination__total').text
max_info = re.findall('\d{1,4}',max_info)[0]
# print(type(max_info))
max_page = int((int(max_info)/9) + 1)
print(max_page)
i_id = 0
for page in range(max_page):
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
res = requests.get(url=url, headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find('ul', class_='fix')
li_list = result.find_all('li')
# for id in range(1, 149):
id = i_id
for li in li_list:
id += 1
title = li.find('div',class_='title').text
time = li.find('div',class_='date tc').text.strip()
year = re.findall('\d{4}',time)[0]
# for id in range(29,178):
real_id = 178 - id
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# href = 'https://www.mob.com/mobdata/report/169'
res_href = requests.get(url=href,headers=headers).content
# loginfo = baseCore.redicPullData('Mob:loginfo')
# account = loginfo.split('|')[0]
# password = loginfo.split('|')[1]
# usecount = loginfo.split('|')[2]
usecount = 0
# 测试用
account = '13636711746'
password = 'Zhenghao123'
# account = '18703752600'
# password = 'Axlk010208!'
# account = '13273737131'
# password = 'liu1230...'
# account = '15237560528'
# password = 'xlk123456!'
# account = '17103126138'
# password = '171BlackOne'
# account = '17103128590'
# password = '171BlackTwo'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
f_url = 'https://www.mob.com/developer/login'
browser.get(f_url)
browser.find_element(By.CLASS_NAME, 's1').click()
browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
if usecount < 5:
pass
else:
return Mob()
# 获取登录的信息
# url = browser.current_url
# print(url)
url = 'https://www.mob.com/mobdata/report'
browser.get(url)
# tags = browser.find_elements(By.CLASS_NAME, 'main-title')
# for tag in tags:
# if 'Mob研究院' in tag.text:
# tag.click()
# else:
# continue
# # try:
# # web = tag.find_element(By.CLASS_NAME, "")
# # web.click()
# # break
# # except:
# # continue
cookies_list = browser.get_cookies()
cookies = {}
# 获取cookie中的name和value,转化成requests可以使用的形式
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
# cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
# cookies_ = json.dumps(cookies)
session = requests.session()
session.cookies.update(cookies)
for i in range(5):
url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
req = session.get(url=url, headers=headers)
data_json = req.json()
news_list = data_json['data']['list']
for info in news_list:
title = info['title']
publishDate = info['effective_date']
year = publishDate[:4]
report_id = info['report_id']
href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
# tf_url = add_check_url(href)
is_member = r.sismember('report_pdf_three_history', href)
if is_member:
continue
res_href = session.get(url=href, headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
headers['token'] = '05bc441a-b09b-40cb-ab65-8d9e63e5c529'
news_req = session.get(url=news_url,headers=headers)
pdf_url = news_req.json()['data']
fin_summary = []
for s in summary_list:
summary = s.text
......@@ -366,13 +481,13 @@ def Mob():
summary = ''.join(fin_summary)
dic_post = {
'title': title, # 报告名称
'url_pdf': url_pdf, # 报告链接
'url_pdf': pdf_url, # 报告链接
'year': year, # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': time, # 时间
'publishDate': publishDate, # 时间
'origin': 'Mob研究院', # 来源
'sourceAddress': href, # 原文链接
'content': '', # 内容
......@@ -382,12 +497,7 @@ def Mob():
order_by = 1
download(dic_post,order_by)
order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
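The rewritten Mob() logs in through Selenium once, copies the browser cookies into a requests session, and then calls the JSON list/download API directly, which is much lighter than driving the page for every report. The cookie hand-off, isolated as a sketch (login URL, locators and account fields come from the hunk; driver options and the executable path are omitted here):

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

def login_session(account: str, password: str) -> requests.Session:
    browser = webdriver.Chrome()  # the module's `opt` / `chromedriver` would be used in the real code
    browser.get('https://www.mob.com/developer/login')
    browser.find_element(By.CLASS_NAME, 's1').click()
    browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(account)
    browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(password)
    browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section'
                                   '/div[3]/div/form/div[3]/div/button/span').click()
    session = requests.session()
    # requests wants a {name: value} mapping; selenium returns a list of cookie dicts
    session.cookies.update({c['name']: c['value'] for c in browser.get_cookies()})
    browser.quit()
    return session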
def yidong_guanxiangtai():
......@@ -452,30 +562,83 @@ def yidong_guanxiangtai():
# print(res.json())
# 巨量算数
def juliangsuanshu():
browser = webdriver.Chrome(chromedriver)
# # 巨量算数
# def juliangsuanshu():
# # browser = webdriver.Chrome(chromedriver)
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
#
# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
# browser.get(url)#跳到指定页面
#
# page_source = browser.page_source#获取页面信息
# soup = BeautifulSoup(page_source, 'html.parser')
#
# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
# for one_info in list_all:
# info_title = one_info.a.text.strip()
# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
# info_href = one_info.a.get('href')
# info_url = 'https://trendinsight.oceanengine.com'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
# list_script = soup_info.find_all('script')
# for script in list_script:
# if 'window._SSR_DATA' in script.text:
# json_str = script.text
# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
#
# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
#
# dic_post = {
# 'title': info_title, # 报告名称
# 'url_pdf': info_pdf, # 报告链接
# 'year': info_date[:4], # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': info_date, # 时间
# 'origin': '巨量算数', # 来源
# 'sourceAddress': info_url, # 原文链接
# 'content': '', # 内容
# 'summary': info_zhaiyao, # 摘要
# 'sid': '1662008524476948481', # 信息源id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# browser.quit()
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)#跳到指定页面
# 巨量算数
page_source = browser.page_source#获取页面信息
def getnews(browser):
page_source = browser.page_source # 获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {
'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
for one_info in list_all:
try:
info_title = one_info.a.text.strip()
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com'+info_href
info_url = 'https://trendinsight.oceanengine.com' + info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
soup_info = BeautifulSoup(res_info.content, 'html.parser')
list_script = soup_info.find_all('script')
for script in list_script:
if 'window._SSR_DATA' in script.text:
json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
......@@ -504,33 +667,97 @@ def juliangsuanshu():
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
except Exception as e:
continue
# todo:点击下一页
# wait = WebDriverWait(browser, 30)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
# try:
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# except:
# time.sleep(1)
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# return getnews(browser)
def juliangsuanshu():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)#跳到指定页面
getnews(browser)
browser.quit()
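The `todo` left inside `getnews` sketches clicking the pager's last `li` and recursing; when it is enabled, the WebDriverWait/`expected_conditions` pair already used elsewhere in this module is the right guard. A hedged version of that pagination step:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(browser) -> bool:
    """Click the last item of the byted pager; return False when it cannot be found."""
    try:
        WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
        browser.find_element(
            By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
        return True
    except Exception:
        return False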
def ke36switch(browser,info_url):
try:
browser.get(info_url) # 跳到指定页面
page_source = browser.page_source # 获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
return soup_info
except:
browser.quit()
proxy = baseCore.get_proxy()
# proxy = {
# 'http': '222.90.4.73:40018',
# 'httpS': '222.90.4.73:40018'
# }
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
browser.refresh()
ke36switch(browser,info_url)
# 36氪
def ke36():
# browser = webdriver.Chrome(chromedriver)
proxy = baseCore.get_proxy()
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe'
browser.get(url)#跳到指定页面
time.sleep(3)
for i in range(10):
try:
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'show-more')))
js = "var q=document.documentElement.scrollTop=3000"
browser.execute_script(js)
time.sleep(2)
browser.find_element(By.CLASS_NAME, 'show-more').click()
except:
break
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
page_source = browser.page_source#获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
for one_info in list_all:
for one_info in list_all[::-1]:
info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href')
# is_member = r.sismember('report_pdf_three_history', info_url)
# if is_member:
# continue
soup_info = ke36switch(browser,info_url)
browser.get(info_url)#跳到指定页面
page_source = browser.page_source#获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10]
info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
if info_date < '2023-05-10':
pass
else:
time.sleep(1)
continue
try:
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
except:
proxy = baseCore.get_proxy()
opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
ke36switch(browser, info_url)
dic_post = {
'title': info_title, # 报告名称
'url_pdf': '', # 报告链接
......@@ -547,7 +774,7 @@ def ke36():
'sid': '1662008421217378306', # 信息源id
}
order_by = 1
download(dic_post, order_by)
# download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -555,17 +782,21 @@ def ke36():
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
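One caveat with `ke36switch`: in its proxy-rotation branch and in the `except` fallback inside `ke36`, the recursive call's result is never returned or reassigned, so after a switch the caller keeps using the old soup. Returning the retry result closes that gap; a sketch under that assumption (the proxy-rebuild helper is hypothetical):

from bs4 import BeautifulSoup

def fetch_with_retry(browser, info_url, retries=2):
    for _ in range(retries + 1):
        try:
            browser.get(info_url)
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            # raises AttributeError when the page was blocked and the meta tag is missing
            soup.find('meta', {'property': 'article:published_time'}).get('content')
            return soup
        except Exception:
            browser = rebuild_browser_with_new_proxy(browser)  # hypothetical helper: new proxy + new driver
    return None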
# 前沿知识库
def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
for i in range(40,60):
log.info(f'====第{i}页====')
url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
# list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
list_all = soup.find('div',{'class':'show_report_list'}).find_all('li')
for one_info in list_all:
info_title = one_info.a.get('title')
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
......@@ -664,7 +895,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
for i in range(10, 128):
for i in range(1, 2):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
......@@ -672,6 +903,7 @@ def shijiejingjiluntan():
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url) # 跳到指定页面
time.sleep(5)
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source # 获取页面信息
......@@ -685,7 +917,12 @@ def shijiejingjiluntan():
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d')
# if info_date >= '2022-07-21':
# continue
try:
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
except:
info_zhaiyao = ''
try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except:
......@@ -726,6 +963,28 @@ def shijiejingjiluntan():
time.sleep(2)
browser.quit()
def get_json(key_word,page,headers):
param = {
"uid": "",
"keyword": key_word,
"type": ["researchReport"],
"client": "web",
"clientVersion": "curr",
"clientType": "web",
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url, headers=headers).text[1:-1]
res_json = json.loads(res)
return res_json
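`get_json` wraps the eastmoney search endpoint: it URL-encodes the JSON `param` blob with spaces stripped, appends a millisecond timestamp, and trims the one-character JSONP padding before `json.loads`. A usage sketch that also reads the hit count the caller below relies on (headers reduced to a User-Agent for illustration; real requests may need the full header set from `dongfangcaifu`):

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

res_json = get_json('REITs', 1, headers)          # keyword, page, headers
total = res_json['hitsTotal']                      # total research-report hits
reports = res_json['result']['researchReport']     # ten items per page
print(total, len(reports))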
# 东方财富网
def dongfangcaifu():
headers = {
......@@ -769,26 +1028,12 @@ def dongfangcaifu():
page = 1
# for page in range(1,500):
# log.info(page)
param = {
"uid": "",
"keyword": key_word,
"type": ["researchReport"],
"client": "web",
"clientVersion": "curr",
"clientType": "web",
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]
res_json = json.loads(res)
res_json_ = get_json(key_word, page, headers)
# 添加页数
total = res_json_['hitsTotal']
page = int(total / 10) + 1  # int() so the range() below gets an integer page count
for page_ in range(1,page+1):
res_json = get_json(key_word,page_,headers)
list_all = res_json['result']['researchReport']
if list_all:
......@@ -847,23 +1092,6 @@ def dongfangcaifu():
order_by = 1
download(dic_post, order_by)
order_by += 1
# log.info(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# log.info(res.json())
# dic_news = {
# '关键字':key_word,
# '标题':news_title,
# '时间':news_date,
# '来源':news_come,
# '摘要':news_content,
# '原文链接':news_url,
# 'PDF链接':news_pdf,
# }
# list_all_info.append(dic_news)
# if len(list_all) != 10:
# break
# 东方财富网2
def dongfangcaifu2():
......@@ -1397,7 +1625,7 @@ if __name__ == '__main__':
# try:
# log.info('mob')
# Mob()
# except:
# except Exception as e:
# pass
# try:
# log.info('yidong_guanxiangtai')
......@@ -1407,24 +1635,25 @@ if __name__ == '__main__':
# try:
# log.info('juliangsuanshu')
# juliangsuanshu()
# except:
# except Exception as e:
# pass
# try:
# log.info('ke36')
# ke36()
# except:
# except Exception as e:
# ke36()
# pass
# try:
# log.info('qianyanzhishiku')
# qianyanzhishiku()
# except:
# pass
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
try:
log.info('shijiejingjiluntan')
shijiejingjiluntan()
except Exception as e:
log.info(e)
pass
# try:
# log.info('dongfangcaifu')
# dongfangcaifu()
......@@ -1442,31 +1671,31 @@ if __name__ == '__main__':
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
try:
log.info('dongfangcaifu5')
dongfangcaifu5()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu6')
dongfangcaifu6()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu7')
dongfangcaifu7()
except Exception as e:
log.info(e)
pass
#
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass
......@@ -53,12 +53,12 @@ class EsMethod(object):
# 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
resultb = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
return resultb
def updateaunn(self, index_name, id, content, contentWithTag):
body = {
......@@ -67,24 +67,28 @@ class EsMethod(object):
'contentWithTag': contentWithTag
}
}
result = self.es.update(index=index_name
resulta = self.es.update(index=index_name
,id=id
,body=body)
log.info('更新结果:%s' % result)
log.info('更新结果:%s' % resulta)
def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img'])
print(len(links))
# 遍历标签,将相对地址转换为绝对地址
for link in links:
print(link)
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
# link['href'] = urljoin(listurl, link['href'])
pass
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
pass
# link['src'] = urljoin(listurl, link['src'])
return html
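`paserUrl` originally rewrote every relative `href`/`src` against the listing URL with `urljoin`; the hunk comments that out and adds debug prints, but the conversion itself is worth keeping around. A compact sketch of the original behaviour:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def absolutize(html: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # rewrite relative links and image sources in place so the stored HTML still resolves
    for link in html.find_all(['a', 'img']):
        if link.has_attr('href'):
            link['href'] = urljoin(base_url, link['href'])
        elif link.has_attr('src'):
            link['src'] = urljoin(base_url, link['src'])
    return html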
def get_news(news_url,ip_dic):
def get_news(news_url,sourceAddress,id):
header = {
'Host': 'www.sec.gov',
'Connection': 'keep-alive',
......@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic):
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
}
response = requests.get(url=news_url,headers=header,verify=False,timeout=30)
response = requests.get(url=news_url,headers=header,verify=False)
# aa = response.text
# print(response.text)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
result = BeautifulSoup(response.content,'html.parser')
# result_ = BeautifulSoup(response.content,'html.parser')
result_ = BeautifulSoup(response.text, 'lxml')
# print(result)
pass
else:
# 请求失败,输出错误信息
log.info(f'请求失败:{response.status_code}----{response.text}')
result = ''
return result
result_ = ''
if result_:
pass
# 相对路径转化为绝对路径
# soup = paserUrl(result_, sourceAddress)
time.sleep(2)
content = result_.text.strip()
# del(result_)
# content = result_
# print(content)
time.sleep(2)
esMethod.updateaunn(esMethod.index_name, str(id), content, str(result_))
def main(esMethod):
redis_conn = redis.Redis(connection_pool=pool)
id_ = redis_conn.lpop('NianbaoUS:id')
id = id_.decode()
# id = "23101317164"
if id:
if id_:
pass
else:
log.info('已无数据')
return
return False
id = id_.decode()
result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id)
result = result_['hits']['hits'][0]
num = 0
......@@ -135,17 +153,8 @@ def main(esMethod):
log.info(f'====={title}=={social_code}===正在更新===')
sourceAddress = result['_source']['sourceAddress']
ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
soup = get_news(sourceAddress,ip_dic)
if soup:
pass
else:
return
# 相对路径转化为绝对路径
soup = paserUrl(soup, sourceAddress)
content = soup.text.strip()
esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
return
get_news(sourceAddress,sourceAddress,id)
return True
def run_threads(num_threads,esMethod):
......@@ -164,6 +173,9 @@ if __name__ == '__main__':
while True:
esMethod = EsMethod()
start = time.time()
num_threads = 5
run_threads(num_threads,esMethod)
log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file
# num_threads = 5
# run_threads(num_threads,esMethod)
# log.info(f'5线程 总耗时{time.time()-start}秒')
result = main(esMethod)
if not result:
break
\ No newline at end of file
# 证监会沪市、gong深市 公司债券和企业债券采集
"""
证监会企业名单
"""
# 证监会沪市、深市 公司债券和企业债券采集
import time
import random
import requests
......@@ -25,7 +22,7 @@ cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
taskType = '企业名单/证监会'
taskType = '企业债券/证监会'
def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe'
......@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信
page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text
for i in range(1,int(total)+1):
# for i in range(1,int(total)+1):
for i in range(224, 225):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
......@@ -241,7 +239,7 @@ if __name__ == '__main__':
# url_parms = ['201010', '201014']
# url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014']
# url_parms = ['201411']
for url_parm in url_parms:
url = getUrl(url_parm)
......
import yfinance as yf
# 获取股票数据
stock = yf.Ticker("MET")
# 获取资产负债表数据
balance_sheet = stock.balance_sheet
# 获取报告日期
report_dates = balance_sheet.index
print(report_dates)
# 获取现金流量表数据
cashflow_statement = stock.cashflow
# 获取利润表数据
income_statement = stock.financials
print(balance_sheet)
print(cashflow_statement)
print(income_statement)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
......@@ -57,8 +57,8 @@ def page_list():
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
......@@ -69,11 +69,11 @@ def page_list():
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,453):
for i in range(1,2):
log.info(f'采集第{i}页数据')
k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
......@@ -110,8 +110,8 @@ def detailpaser(dmsg):
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
......@@ -122,7 +122,7 @@ def detailpaser(dmsg):
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
try:
......
import json
import time
import uuid
import pymysql
import redis
import requests
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = baseCore.cnx_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
pathType = 'CrowDingZhi/'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
'Host': 'query.sse.com.cn',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
create_time, come, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn')
# log.info(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where type_id=15 and full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# log.info("更新完成:{}".format(pdf_id))
return pdf_id
def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
# response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
response = requests.get(pdf_url)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
for i in range(0, 3):
try:
name = str(getuuid()) + '.' + category
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
if category == 'pdf':
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
else:
page_size = 0
retData['content'] = ''
break
except Exception as e:
time.sleep(3)
continue
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
log.info(f'error---{e}')
return retData
return retData
if __name__ == "__main__":
num = 0
t = int(time.time()*1000)
url_ = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req_ = requests.get(url=url_, headers=headers)
data_json = req_.json()
print(data_json)
pageCount = data_json['pageHelp']['pageCount']
for i in range(1,int(pageCount + 1)):
url = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={i}&pageHelp.beginPage={i}&pageHelp.cacheSize=1&pageHelp.endPage={i}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req = requests.get(url=url, headers=headers)
data_list = req.json()['result']
for info in data_list:
publishDate = info['cmsOpDate'] # 处理日期
year = publishDate[:4]
com = '上海证券交易所'
docTitle = info['docTitle'] # 处理事由
docType = info['docType'] # 文档类型
docURL = "http://" + info['docURL'] # 链接 http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
flag = r.sismember('IN-20231227-0001', docURL)
if flag:
log.info('信息已采集入库过')
continue
# 上传至obs
retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
path = retData['path']
full_path = retData['full_path']
file_size = retData['file_size']
create_by = retData['create_by']
content = retData['content']
status = 1
num += 1
create_time = time_now
# 上传到附件表
att_id = tableUpdate(year, docTitle+'.'+docType, 15, '', '', path, full_path, docType, file_size, num, status, create_by, create_time, com, page_size)
if att_id:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sid = '1739914218978594817'
info_code = "IN-20231227-0001"
dic_news = {
'attachmentIds': str(att_id),
'content': content,
'contentWithTag': '',
'id': '',
'origin': com,
'publishDate': publishDate,
'sid': sid,
'sourceAddress': docURL,
'title': docTitle,
'source':'16',
'type': ''
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(e)
log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
r.sadd(info_code, docURL)
continue
# 中央全面深化改革委员会会议
import json
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'www.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
        request = requests.get(url=url, headers=headers)
        request.encoding = request.apparent_encoding
        soup = BeautifulSoup(request.content, 'html.parser')
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
year = int(publishDate[:4])
if year < 2023:
continue
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
            # print(news_soup)
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
'origin': source,
'contentWithTag': str(contentwithTag),
'content': content,
'summary': summary,
'publishDate': publishDate,
'sid': sid,
'subjectId': '1681549361661489154',
'sourceAddress':newsUrl,
'checkStatus': 1,
'deleteFlag': 0,
'createDate': time_now,
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
\ No newline at end of file
......@@ -27,29 +27,19 @@ class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'policy'
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"query": {
"bool": {
"must": [
{
"nested" : {
"query" : {
"bool" : {
"must" : [
{
"match_phrase" : {
"labels.relationId" : {
"query" : "1698"
}
}
"term": {
"sid.keyword": {
"value": "1662008524476948481"
}
]
}
},
"path" : "labels"
}
}
]
......@@ -112,7 +102,7 @@ def main(page, p, esMethod):
unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
# 删除重复的文档
for doc_id in unique_document_ids:
esMethod.delete(index_name="policy", id=doc_id)
esMethod.delete(index_name="researchreportdata", id=doc_id)
......
......@@ -121,7 +121,7 @@ def get_content2():
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
if '.wps' in file_href or '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
......
# 天眼查商标申请数量
# 列表接口(POST,脚本中仅以注释保留):https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 当前实际调用统计接口(GET):https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_=<时间戳>&cgid=<天眼查id>
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/国内上市'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
        # social_code = '91130629MA0CG2DL51'  # 调试用固定信用代码
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
\ No newline at end of file
# 天眼查商标申请数量
# 列表接口(POST,脚本中仅以注释保留):https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 当前实际调用统计接口(GET):https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_=<时间戳>&cgid=<天眼查id>
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/中国500强'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:zg500shSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
......@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(10)
time.sleep(20)
s = requests.session()
#获取到token和cookies
......
......@@ -239,6 +239,8 @@ if __name__=="__main__":
list_all_info = []
while True:
#一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo()
if dict_json:
if get_info(dict_json):
......
......@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page):
cnx_.commit()
except Exception as e:
log.error(f"保存数据库失败:{e}")
# todo: 放入redis
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0:
#列表为空认为结束
......
from bs4 import BeautifulSoup
import requests,time,re
from base import BaseCore
# import pandas as pd
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
log = baseCore.getLogger()
taskType = '500强专利'
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
# df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
def name_handle(english_name_):
if 'INC.' in english_name_ or 'LTD.' in english_name_ or 'CO.' in english_name_ \
or 'CORP.' in english_name_ or 'GMBH' in english_name_ \
or ' AG' in english_name_ or 'SARL' in english_name_ or 'S.A.' in english_name_ \
or 'PTY' in english_name_ or 'LLC' in english_name_ or 'LLP' in english_name_ \
or ' AB' in english_name_ or ' NV' in english_name_ or 'N.V.' in english_name_ \
or 'A.S.' in english_name_ or ' SA' in english_name_ or ',Limited' in english_name_ \
            or ' SE' in english_name_ or ' PLC' in english_name_ or 'S.P.A.' in english_name_:
english_name = english_name_.replace('INC.', '').replace('LTD.', '').replace('CO.', '').replace('CORP.', '') \
.replace('GMBH', '').replace(' AG', '').replace('SARL', '').replace('S.A.', '').replace('PTY', '') \
.replace('LLC', '').replace('LLP', '').replace(' AB', '').replace(' NV', '').replace(',', '') \
.replace('A.S.', '').replace(' SA', '').replace(',Limited', '').replace(' SE', '').replace(' PLC', '') \
.replace('N.V.', '').replace('S.P.A.', '').rstrip()
return english_name
else:
english_name = english_name_
return english_name
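# 用法示意(假设传入值已按下方主流程 upper() 处理,结果为按当前替换规则推断):
#   name_handle('APPLE INC.')         -> 'APPLE'
#   name_handle('TOYOTA MOTOR CORP.') -> 'TOYOTA MOTOR'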
if __name__ == '__main__':
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gwSocial_code')
# social_code = '9111000071093123XX'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
english_name_ = data[5]
place = data[6]
if place == 1:
log.info(f'{com_name}--国内')
baseCore.rePutIntoR('Zhuanli:gwSocial_code',social_code)
continue
if english_name_:
pass
else:
query = f"select * from sys_base_enterprise where social_credit_code ='{xydm}'"
cursor_.execute(query)
reslut = cursor_.fetchone()
english_name_ = reslut[32]
# todo:将该字段更新到144企业库
update_ = f"update EnterpriseInfo set EnglishName='{english_name_}' where SocialCode='{xydm}' "
cursor.execute(update_)
cnx.commit()
english_name_ = english_name_.upper()
english_name = name_handle(english_name_)
num_zhuanli = 0
# url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
# int(float(time.time()) * 1000))
#
# res1 = requests.get(url1, headers=headers)
# soup1 = BeautifulSoup(res1.content, 'html.parser')
#
# num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
#
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
# if zhuanli:
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={english_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli == 0:
dic_info = {
'com_name': com_name,
'social_code': social_code,
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-----已经存在--{year}--无专利信息")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}------新增----无专利信息")
break
dic_info = {
'com_name': com_name,
'social_code': social_code,
'year': year,
'num': zhuanli
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
except:
log.info("error!{}".format(social_code))
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
\ No newline at end of file
import requests,re,time,os,datetime,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import redis
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
for i in range(len(df_all['英文名称'])):
for num in range(0, 2):
try:
if '中国' not in df_all['企业所属国家'][i]:
com_name = df_all['英文名称'][i]
num_zhuanli = 0
url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
int(float(time.time()) * 1000))
res1 = requests.get(url1, headers=headers)
soup1 = BeautifulSoup(res1.content, 'html.parser')
num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
zhuanli = '10000'
if zhuanli == '10000':
for year in range(2023, 1900, -1):
                        url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={com_name}&IN=&CPC=&IC=&rnd=' + str(
                            int(float(time.time()) * 1000))
                        # url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'  # 调试用固定地址
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli2 = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli2 == 0:
break
df_all[f'{year}'][i] = zhuanli2
# num_zhuanli = num_zhuanli + zhuanli2
num_zhuanli = num_zhuanli + zhuanli2
print(year)
time.sleep(random.uniform(1.5, 2))
else:
num_zhuanli = int(zhuanli)
time.sleep(random.uniform(1.5, 2))
df_all['Espacenet专利检索'][i] = num_zhuanli
print(f"{com_name} : {num_zhuanli}")
break
except:
if num == 0:
print("重试")
time.sleep(60)
continue
else:
print("error!{}".format(df_all['英文名称'][i]))
\ No newline at end of file
import requests,time,re,random
import functools
import random
import threading
import traceback
import pymysql
import requests,time
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
import concurrent.futures
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
# cnx = baseCore.cnx
# cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
taskType = '天眼查专利/国内榜单'
# 需调整放入国外和国内的redis
# 设置一个全局变量用于控制线程退出
should_exit = False
def connectSql():
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
return cnx,cursor
#关闭数据库连接
def closeSql(cnx,cursor):
cnx.close()
cursor.close()
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
# 获取代理
def get_proxy():
cnx,cursor = connectSql()
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
cnx.commit()
closeSql(cnx,cursor)
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 4)]
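# get_proxy 用法示意(假设 clb_proxy 表中代理以 "host-port" 文本格式保存):
#   ip = get_proxy()
#   res = requests.get(url, headers=header, proxies=ip, verify=False)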
def spider_zhuanli(com_name, social_code, tycid):
page = 1
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
......@@ -36,13 +78,14 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
while True:
log.info(f'===正在处理第{page}页===')
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
try:
ip = baseCore.get_proxy()
ip = get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
ip = get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
except:
......@@ -53,8 +96,38 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
time.sleep(2)
continue
# print(res_j)
try:
list_all = res_j['data']['items']
# print(list_all)
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
cnx, cursor = connectSql()
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
# lock.acquire()
cursor.execute(selectSql)
count = cursor.fetchone()[0]
closeSql(cnx, cursor)
# lock.release()
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
cnx, cursor = connectSql()
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
# lock.acquire()
cursor.execute(insertSql, values_tuple)
cnx.commit()
# lock.release()
closeSql(cnx,cursor)
log.info(f"{com_name}---{social_code}---新增---无专利")
log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
break
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
......@@ -115,36 +188,42 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
cnx, cursor = connectSql()
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
# lock.acquire()
cursor.execute(selectSql)
count = cursor.fetchone()[0]
# lock.release()
closeSql(cnx,cursor)
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
cnx,cursor = connectSql()
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
closeSql(cnx,cursor)
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
page+=1
else:
return 0
log.info(f"---{social_code}----{tycid}--共{page}页--结束处理")
break
if __name__ == "__main__":
while True:
list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# social_code = '9111010566840059XP'
def runSpider():
# 根据从Redis中拿到的社会信用代码, 在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
social_code = '91360400794798498A'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
# 任务执行结束后设置should_exit为True
global should_exit
should_exit = True
start = time.time()
try:
data = baseCore.getInfomation(social_code)
......@@ -153,50 +232,48 @@ if __name__ == "__main__":
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
continue
return False
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
place = data[6]
if place != 1:
baseCore.rePutIntoR('Zhuanli:gwSocial_code', social_code)
return False
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
retData = getTycIdByXYDM(social_code)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cnx,cursor = connectSql()
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{social_code}'"
cursor.execute(updateSql)
cnx.commit()
closeSql(cnx,cursor)
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
return False
elif not retData['reput'] and not retData['tycData']:
continue
return False
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
return False
log.info(f"{id}---{social_code}----{tycid}----开始处理")
spider_zhuanli(com_name, social_code, tycid)
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
traceback.print_exc()
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
......@@ -204,3 +281,56 @@ if __name__ == "__main__":
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
finally:
# global should_exit
# should_exit = True
return
# if __name__ == "__main__":
# while True:
# # 创建一个线程池,指定线程数量为4
# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
# results = []
# while True:
# # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# # social_code = '91350700856994874M'
# # 判断 如果Redis中已经没有数据,则等待
# if social_code == None:
# # time.sleep(20)
# break
#
# future = executor.submit(runSpider, social_code)
# results.append(future)
# # 获取任务的执行结果
# for future in concurrent.futures.as_completed(results):
# try:
# result = future.result()
# # 处理任务的执行结果
# print(f"任务执行结束: {result}")
# except Exception as e:
# # 处理任务执行过程中的异常
# # print(f"任务执行exception: {e}")
# traceback.print_exc()
def run_threads(num_threads):
threads = []
for i in range(num_threads):
thread = threading.Thread(target=runSpider)
threads.append(thread)
thread.start()
# while True:
# if should_exit:
# break
for thread in threads:
thread.join()
if __name__ == '__main__':
while True:
start = time.time()
num_threads = 1
run_threads(num_threads)
log.info(f'5线程 总耗时{time.time()-start}秒')
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
try:
ip = baseCore.get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
except:
        # 最多重试3次,成功拿到结果后立即跳出,避免重复请求
        for i in range(3):
            try:
                res_j = requests.get(url=url, headers=header, verify=False).json()
                break
            except:
                time.sleep(2)
                continue
# print(res_j)
try:
list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all)
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
else:
return 0
if __name__ == "__main__":
while True:
list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code_zg500')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
......@@ -13,9 +13,10 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu_test
groupId=python_google
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
......@@ -168,6 +168,8 @@ class GoogleSpider(object):
try:
driver.get(url)
# 等待页面加载完成
time.sleep(3)
driver.refresh()
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
html=driver.page_source
......@@ -256,6 +258,7 @@ class GoogleSpider(object):
self.driver.get(self.url)
# 等待页面加载完成
time.sleep(3)
self.driver.refresh()
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
search_input = self.driver.find_element('xpath', '//textarea[@title="Google 搜索"]')
......@@ -265,7 +268,11 @@ class GoogleSpider(object):
time.sleep(3)
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
try:
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click()
except:
self.driver.find_element('xpath', '//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span').click()
time.sleep(3)
self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
time.sleep(2)
......@@ -273,7 +280,8 @@ class GoogleSpider(object):
time.sleep(2)
self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
except Exception as e:
print(e)
self.logger.info(f'--点击按钮失效----{e}')
return
self.logger.info("开始抓取首页..." + self.searchkw )
time.sleep(5)
flag, lists = self.parse_page()
......@@ -446,7 +454,7 @@ class GoogleSpider(object):
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
......
......@@ -40,7 +40,7 @@ class GoogleTaskJob(object):
try:
for record in consumer:
try:
logger.info("value:",record.value)
logger.info(f"value:{record.value}")
keymsg=record.value
if keymsg:
break
......@@ -176,7 +176,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
import requests
url = 'https://www.ctwant.com/article/308534'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
req = requests.get(url, headers=headers)
print(req.text)
\ No newline at end of file
......@@ -113,23 +113,23 @@ if __name__=='__main__':
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except:
continue
# if len(author)>4:
# continue
if len(author)>4:
continue
# if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \
# or '”' in author\
# or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
if '(' in author or '本刊' in author \
or '记者' in author or '新闻社' in author \
or '”' in author\
or '大学' in author or '洛桑江村' in author:
continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
if is_member:
continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
......@@ -165,7 +165,7 @@ if __name__=='__main__':
}
log.info(dic_news)
if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href)
r.sadd('qiushileaderspeech_two::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}')
......@@ -55,56 +55,56 @@ from obs import ObsClient
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def use_ocr(img):
ocr = ddddocr.DdddOcr()
with open(img, 'rb') as f:
image = f.read()
res = ocr.classification(image)
print(res)
return res
if __name__=="__main__":
requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}')
requests.adapters.DEFAULT_RETRIES = 3
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "http://zxgk.court.gov.cn/shixin/"
browser.get(url)
# 可改动
time.sleep(20)
page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser')
img_url = soup.select('img[id="captchaImg"]')[0]['src']
browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
browser.find_element(By.ID, 'yzm').send_keys(yzm)
browser.find_element(By.ID, 'searchForm').click()
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# baseCore = BaseCore()
# log = baseCore.getLogger()
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
#
# def use_ocr(img):
# ocr = ddddocr.DdddOcr()
# with open(img, 'rb') as f:
# image = f.read()
# res = ocr.classification(image)
# print(res)
# return res
#
# if __name__=="__main__":
# requests.DEFAULT_RETRIES = 5
# time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# log.info(f'开始时间为:{time_start}')
#
# requests.adapters.DEFAULT_RETRIES = 3
# headers = {
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
# }
#
# opt = webdriver.ChromeOptions()
# opt.add_argument(
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
#
# opt.add_argument("--ignore-certificate-errors")
# opt.add_argument("--ignore-ssl-errors")
# opt.add_experimental_option("excludeSwitches", ["enable-automation"])
# opt.add_experimental_option('excludeSwitches', ['enable-logging'])
# opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
# chromedriver = r'D:/cmd100/chromedriver.exe'
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
# url = "http://zxgk.court.gov.cn/shixin/"
# browser.get(url)
# # 可改动
# time.sleep(20)
# page_source = browser.page_source
# soup = BeautifulSoup(page_source, 'html.parser')
# img_url = soup.select('img[id="captchaImg"]')[0]['src']
#
# browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
#
#
# browser.find_element(By.ID, 'yzm').send_keys(yzm)
# browser.find_element(By.ID, 'searchForm').click()
# wait = WebDriverWait(browser, 30)
# wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# screen_img_path = "D:/screen/xxx.png"
# out_img_path = "D:/out/xxx.png"
......@@ -112,3 +112,27 @@ if __name__=="__main__":
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
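# 注:上述下载地址带有 OSSAccessKeyId/Expires/Signature 参数,属于限时签名链接,过期后需重新获取新链接再下载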
\ No newline at end of file
#百度翻译 不登录翻译1000字 登录翻译5000字
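# 说明(依据下方实现推断):正文按标签取纯文本逐段翻译,超长段落在标点处切分后再分段翻译;
# 下方阈值已由 1000 字临时改为 50 字用于调试,原值以注释形式保留,正式使用时应改回 1000。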
import re
import string
import time
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore
baseCore = BaseCore()
class Translate():
def __init__(self):
""""
initialize the class, and include the fundamental attributes
"""
# self._lang_list = ['zh', 'en', 'kor', 'fra', 'jp', 'el', 'ru']
# self._lang_list_original = ["中文", "英语", "韩语", "法语", "日语", "希腊语", "俄语"]
# self._num = len(self._lang_list)
self.url = "https://fanyi.baidu.com/#{}/{}/{}"
self.header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
self.db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软['数据源_0106']
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def translate(self, sentence, browser, lang):
sentence_ = sentence
# browser = self.createDriver()
wait = WebDriverWait(browser, 20)
try:
word_type = self.get_input_language_type(sentence_, browser, wait)
except:
browser.quit()
browser = self.createDriver()
result, browser = self.translate(sentence_, browser, lang)
return result, browser
if word_type:
if word_type == lang:
pass
else:
word_type = lang
url = self.url.format(word_type, 'zh', sentence_)
browser.set_page_load_timeout(10)
try:
browser.get(url)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')))
result_ = browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')
result = result_.text.strip()
print(f'翻译后的句子:{result}')
return result, browser
except:
browser.quit()
print(f'翻译失败,重新翻译。当前句子为{sentence_}')
browser = self.createDriver()
result, browser = self.translate(sentence_, browser, lang)
return result, browser
def get_input_language_type(self, word, browser, wait):
browser.get("https://fanyi.baidu.com/")
wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
input_word = browser.find_element(By.ID, "baidu_translate_input")
input_word.send_keys(word)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')))
word_type = browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')
word_type = word_type.get_attribute("data-lang")
return word_type
def is_punctuation(self, char):
punctuation = string.punctuation + '、' + '(' + '…' + ')' + '《' + '》' + '“' + '”' + ':' + ';' + '!' + ' ' + '。'
return char in punctuation
def sentence_split_sentence(self, contentWithTag):
pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
match_group = pattern.finditer(contentWithTag)
sentences = []
if match_group:
for _ in match_group:
start_end_index = _.span()
sentences.append((start_end_index[0], start_end_index[1], _.group()))
if (not sentences) and (len(contentWithTag) >= 4):
sentences.append((0, len(contentWithTag), contentWithTag))
return sentences
def jionstr(self, html):
paragraphs = []
current_sentence = ''
for tag in html.find_all(text=True):
sentence = str(tag)
if sentence == '\n' or sentence == '\t' or sentence == ' ':
continue
if self.is_punctuation(sentence):
continue
# 检查拼接后的句子长度是否超过1000字
if len(current_sentence) + len(sentence) <= 1000:
current_sentence += sentence
else:
paragraphs.append(current_sentence.strip())
current_sentence = sentence
return paragraphs
def gethtml(self):
# data = self.db_storage.find_one({'titleForeign':{'$ne':''}})
try:
browser = self.createDriver()
except:
browser = self.createDriver()
datas = self.db_storage.find({'postCode': '2', 'newsTime': {'$gte': '2024-01-01', '$lt': '2024-01-02'}}).limit(10)
for data in datas:
contentWithTag = data['richTextForeign']
# 根据分段符\n拆分,拿取纯文本,翻译
# # 拆分成段
# # pattern1 = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
# sentence_list = self.sentence_split_sentence(contentWithTag)
# print(sentence_list)
# # 每段拆分成标签
# result_list = []
# # for sentence_tag in tqdm(sentence_list):
# sentence_xml = BeautifulSoup(sentence_tag[2], 'lxml')
# for tag in sentence_xml.find_all(text=True):
# sentence =
# if len(sentence.strip()) == 0:
# # # print(f'aa当前内容为:{sentence}')
# result = sentence.strip()
# sentence_xml.text.replace(sentence, result)
# result_list.append({
# "start_index": sentence_tag[0],
# "sentence": result,
# "sentence_xml": sentence_xml
# })
# elif self.is_punctuation(sentence.strip()) or len(sentence.strip()) == 1:
            # # # print(f'bb current content: {sentence}')
# result_list.append({
# "start_index": sentence_tag[0],
# "sentence": sentence,
# "sentence_xml": sentence_xml
# })
# else:
            # # Translate the text
# result = self.translate(sentence)
# new_xml = sentence_tag[2].replace(sentence, result)
#
# result_list.append({
# "start_index": sentence_tag[0],
# # "sentence": sentence + "\n",
# "sentence": result,
# "sentence_xml": new_xml
# })
            # # todo: sort the pieces to keep the original order
# sorted_context_list = sorted(result_list, key=lambda x: x["start_index"])
# final_list = [item["sentence_xml"] for item in sorted_context_list]
#
# return f'\n'.join(final_list)
# paragraphs = self.jionstr(contentWithTag)
html = BeautifulSoup(contentWithTag, 'html.parser')
content = html.text
lang = baseCore.detect_language(content)
for tag in html.find_all(text=True):
sentence = str(tag)
# sentence = " 実際に働き手の数が8がけ(8割)になる16年後、介護のようなケアサービスを今のような形で受けることは困難になると予測される。"
if sentence == '\n' or sentence == '\t' or sentence == ' ':
continue
if self.is_punctuation(sentence):
continue
# if len(sentence) > 1000:
if len(sentence) > 50:
print(len(sentence))
# index_1000 = sentence[999]
index_1000 = sentence[49]
                    # Check whether that character is a comma or a full stop
                    if index_1000 == '.' or index_1000 == '。' or index_1000 == ',' or index_1000 == ',':
                        # The cut falls on a punctuation mark
                        # print(f'Current segment 1: {sentence[:1000]}')
                        print(f'Current segment 1: {sentence[:50]}')
                        # result1, browser = self.translate(sentence[:1000].strip(), browser, lang)
                        result1, browser = self.translate(sentence[:50].strip(), browser, lang)
                        # print(f'Current segment 2: {sentence[1000:]}')
                        print(f'Current segment 2: {sentence[50:]}')
                        # result2, browser = self.translate(sentence[1000:].strip(), browser, lang)
                        result2, browser = self.translate(sentence[50:].strip(), browser, lang)
tag.replace_with(result1+result2)
else:
                        # Not a punctuation mark: walk backwards to the nearest one
# i = 1000
i = 50
while i >= 0:
j = i-1
if j <= 0:
break
index_punctuation = sentence[j]
if index_punctuation == '.' or index_punctuation == '。' or index_punctuation == ',' or index_punctuation == ',':
                                print(f'Current segment 3: {sentence[:j+1]}')
                                result1, browser = self.translate(sentence[:j+1].strip(), browser, lang)
                                print(f'Current segment 4: {sentence[j+1:]}')
                                result2, browser = self.translate(sentence[j+1:].strip(), browser, lang)
tag.replace_with(result1+result2)
break
else:
i = j
continue
if i == 1:
                            print(f'Current segment 5: {sentence}')
# result, browser = self.translate(sentence[:1000].strip(), browser, lang)
result, browser = self.translate(sentence[:50].strip(), browser, lang)
tag.replace_with(result)
continue
else:
                # Translate
                print(f'Current segment 6: {sentence}')
                result, browser = self.translate(sentence, browser, lang)
                # Replace the text node with its translation
tag.replace_with(result)
time.sleep(2)
print(html.prettify())
# return html.prettify()
if __name__ == "__main__":
test = Translate()
# test.translate()
# print(test.gethtml())
test.gethtml()
#coding=utf-8
......@@ -25,7 +25,7 @@ from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
# baseCore=BaseCore()
class BaiduSpider(object):
def __init__(self,searchkw,wordsCode,sid):
......@@ -40,13 +40,15 @@ class BaiduSpider(object):
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
# chrome_driver =self.config.get('selenium', 'chrome_driver')
# self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# path = Service(chrome_driver)
# chrome_options = webdriver.ChromeOptions()
# chrome_options.binary_location = self.config.get('selenium', 'binary_location')
# proxy = baseCore.get_proxy()
# chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# # driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
......@@ -54,14 +56,16 @@ class BaiduSpider(object):
self.wordsCode = wordsCode
self.sid = sid
def createDriver(self):
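        # Build a Chrome driver from the [selenium] config section and route it through a proxy obtained from baseCore.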
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
        # Set up a proxy
        # proxy = "127.0.0.1:8080"  # proxy address and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
    # Insert the list data into the meta_search_result table
def itemInsertToTable(self,items):
try:
......
# -*- coding: utf-8 -*-
......@@ -12,12 +12,16 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
from tqdm import tqdm
class BaiduTaskJob(object):
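    # Pulls keyword tasks from Kafka and dispatches BaiduSpider crawls for each keyword.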
def __init__(self):
......@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
for record in tqdm(consumer, desc="Consuming messages"):
try:
                    logger.info(f"value: {record.value}")
keymsg=record.value
......@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
                        logger.info(f'One round of collecting keyword {wordsName} took {baseCore.getTimeCost(start_time,end_time)}')
                logger.info(f"Got keyword group: {wordsName}---{wordsCode}")
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
......@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
# finally:
# baiduSpider.driver.quit()
        # logger.info("Keyword collection finished!" + searchkw)
def createDriver(self):
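        # Build a standalone Chrome driver with hard-coded local driver/binary paths, a proxy from baseCore, and flags that hide Selenium automation.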
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def runSpider(self,kwmsg):
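        # Run one Baidu search task for a keyword; if the crawl fails, recreate the driver and retry once.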
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
......@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html()
except Exception as e:
try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html()
except Exception as e:
                logger.info('Baidu search exception: ' + searchkw)
......
# -*- coding: utf-8 -*-
......@@ -293,6 +293,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
......@@ -304,8 +305,8 @@ class BaseCore:
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
......
[redis]
......@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
# from baiduSpider import BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# try:
# detailurl = url
# title = title
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
# except Exception as e:
# content = ''
# contentWithTag = ''
#
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
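# Query eastmoney's research-report search API (keyword 科达自控, page 1) and print the raw JSONP response.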
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
res_json = res.text
print(res_json)
\ No newline at end of file