Commit a38c9372 Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

......@@ -464,7 +464,8 @@ def zhengquanqihuo():
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse():
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
# url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_=1703469889542'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
......@@ -485,9 +486,13 @@ def sse():
# os.makedirs(path)
for page in range(0, int(total_page)):
t = int(time.time())
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=24278800487459370386559742313666&_={t}'
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_={t}'
data = policy.getrequest_json(headers, url_page)
newslist = data['data']['knowledgeList']
# if newslist:
# pass
# else:
# continue
# print(newslist)
for news in newslist:
num += 1
......@@ -521,8 +526,8 @@ def sse():
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
for page_ in doc.pages():
content += page_.get_text()
file_href = newsUrl
file_name = title
......@@ -628,7 +633,7 @@ def sse():
for att_id in id_list:
baseCore.deliteATT(att_id)
except Exception as e:
log.info(f"error!!!{newsUrl}")
log.info(f"error!!!{newsUrl}===={title}")
log.info(e)
log.info(f'====第{page}页====处理结束,================')
......@@ -972,14 +977,14 @@ def guizhou():
if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
reform()
# shenzhen()
zhengquanqihuo()
# reform()
# # shenzhen()
# zhengquanqihuo()
try:
sse()
except:
pass
hebei()
guizhou()
# hebei()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file
......@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing
if __name__ == "__mian__":
if __name__ == "__main__":
beijing()
reits.sse()
reits.reform()
......
......@@ -403,6 +403,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
self.cnx.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
......@@ -472,6 +473,10 @@ class BaseCore:
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = self.r.lpop(key)
return item.decode() if item else None
......@@ -658,6 +663,8 @@ class BaseCore:
return 'cn'
if result[0] == '':
return 'cn'
if result[0] == 'ja':
return 'jp'
return result[0]
#创建excel文件
......@@ -685,6 +692,10 @@ class BaseCore:
# 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item)
# 增加计数器的值并返回增加后的值
......
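The ping-and-reconnect logic added above is duplicated in redicPullData and rePutIntoR. A minimal sketch of factoring it into one shared helper (the method name _ensure_redis is an assumption; the connection parameters mirror the values already hard-coded above):

import redis

# methods inside the existing BaseCore class (sketch only)
def _ensure_redis(self):
    # Recreate the client only when the current connection no longer answers PING.
    try:
        self.r.ping()
    except Exception:
        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

def redicPullData(self, key):
    self._ensure_redis()
    item = self.r.lpop(key)
    return item.decode() if item else None

def rePutIntoR(self, key, item):
    self._ensure_redis()
    self.r.rpush(key, item)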
......@@ -674,7 +674,7 @@ if __name__ == "__main__":
# BaseInfoEnterprise()
# FBS()
# MengZhi()
# NQEnterprise()
NQEnterprise()
# SEC_CIK()
# dujioashou()
# omeng()
......@@ -683,6 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task()
# FinanceFromEast()
# ipo_code()
JingyingfenxiFromEase()
# JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
......@@ -292,7 +292,7 @@ def dic_handle(result_dic):
return aa_dict
# 采集准备
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
# if social_code:
# dic_info = baseCore.getInfomation(social_code)
......@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else:
# 开始采集
try:
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3)
......@@ -373,7 +373,7 @@ def ifbeforename(company_url):
return ''
# 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
......@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# print(aa_dic)
sendkafka(aa_dic)
......@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
sendkafka(aa_dic)
# 判断名称是否统一
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = ''
try:
company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
......@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
# 判断是否是曾用名
tr = tr_list[:1][0]
......@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
else:
#没有搜到相同的企业名称
data = [com_name, social_code]
......@@ -549,6 +549,7 @@ if __name__ == '__main__':
else:
log.info('==========已无cookies==========')
time.sleep(30)
continue
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
......@@ -579,8 +580,8 @@ if __name__ == '__main__':
}
start_time = time.time()
# 获取企业信息
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '91220101606092819L||'
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '913300007125582210||'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name)
......@@ -595,6 +596,11 @@ if __name__ == '__main__':
while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
if company_field:
flag = False
......@@ -604,26 +610,28 @@ if __name__ == '__main__':
continue
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[2].replace(' ', '')
ynDomestic = company_field.split('|')[15]
countryName = company_field.split('|')[16]
securitiesCode = company_field.split('|')[17]
securitiesShortName = company_field.split('|')[18]
listingDate = company_field.split('|')[21]
category = company_field.split('|')[19]
exchange = company_field.split('|')[20]
# ynDomestic = ''
# countryName = ''
# securitiesCode = ''
# securitiesShortName = ''
# listingDate = ''
# category = ''
# exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
com_name = company_field.split('|')[1].replace(' ', '')
# ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19]
# exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21]
ynDomestic = '1'
countryName = '中国内地'
securitiesCode = ''
securitiesShortName = ''
listingDate = ''
category = ''
exchange = ''
listType = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(2)
# break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
......
......@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
# 判断是否是曾用名
tr = tr_list[:1][0]
......@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
#没有搜到相同的企业名称
data = [com_name, social_code]
......
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df_all = pd.read_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', dtype=str)
list_com = []
for num_df in range(len(df_all)):
com_name = str(df_all['企业名称'][num_df])
dic_com = {
'social_code': '',
'com_name': com_name
}
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from sys_base_enterprise where name = %s '''
cursor.execute(sel_sql, com_name)
selects = cursor.fetchone()
if selects:
print(f'【{num_df}/{len(df_all)}】==={com_name}找到')
social_code = selects[0]
else:
print(f'【{num_df}/{len(df_all)}】==={com_name}未找到')
social_code = ''
df_all.loc[num_df, '信用代码'] = str(social_code)  # use .loc so the assignment is written back to df_all
df_all.to_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', index=False)
\ No newline at end of file
......@@ -28,7 +28,7 @@ headers = {
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
......
......@@ -38,7 +38,7 @@ headers = {
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5OTkyNTk5NywiZXhwIjoxNzAyNTE3OTk3fQ.9iXmxFEiBdu2WYa7RwdU0xKKx7v_wBe9-QipH0TNKp9Dzk_2cZK1ESsmO1o8ICrddb5sx2cl5pjOBoaaf_9Qsg',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
......@@ -70,7 +70,7 @@ def beinWork(tyc_code, social_code,start_time):
pass
except Exception as e:
#todo:重新放入redis中
baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode',social_code)
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code)
log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败'
state = 0
......@@ -302,10 +302,11 @@ def doJob():
continue
id = data[0]
xydm = data[2]
com_name = data[1]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
retData = getTycIdByXYDM(com_name)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
......
......@@ -43,7 +43,7 @@ class EsMethod(object):
"must": [
{
"match": {
"type": "1"
"type": "0"
}
}
]
......@@ -115,7 +115,7 @@ def main(page, p, esMethod):
attid = mms['_source']['attachmentIds'][0]
log.info(f'{id}-{attid}--{title}--{sourceAddress}---')
selects = secrchATT('1', attid)
selects = secrchATT('4', attid)
if selects:
pass
else:
......
......@@ -53,12 +53,12 @@ class EsMethod(object):
# 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
resultb = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
return resultb
def updateaunn(self, index_name, id, content, contentWithTag):
body = {
......@@ -67,24 +67,28 @@ class EsMethod(object):
'contentWithTag': contentWithTag
}
}
result = self.es.update(index=index_name
resulta = self.es.update(index=index_name
,id=id
,body=body)
log.info('更新结果:%s' % result)
log.info('更新结果:%s' % resulta)
def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img'])
print(len(links))
# 遍历标签,将相对地址转换为绝对地址
for link in links:
print(link)
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
# link['href'] = urljoin(listurl, link['href'])
pass
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
pass
# link['src'] = urljoin(listurl, link['src'])
return html
def get_news(news_url,ip_dic):
def get_news(news_url,sourceAddress,id):
header = {
'Host': 'www.sec.gov',
'Connection': 'keep-alive',
......@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic):
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
}
response = requests.get(url=news_url,headers=header,verify=False,timeout=30)
response = requests.get(url=news_url,headers=header,verify=False)
# aa = response.text
# print(response.text)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
result = BeautifulSoup(response.content,'html.parser')
# result_ = BeautifulSoup(response.content,'html.parser')
result_ = BeautifulSoup(response.text, 'lxml')
# print(result)
pass
else:
# 请求失败,输出错误信息
log.info(f'请求失败:{response.status_code} {response.text}')
result = ''
return result
result_ = ''
if result_:
pass
# 相对路径转化为绝对路径
# soup = paserUrl(result_, sourceAddress)
time.sleep(2)
content = result_.text.strip()
# del(result_)
# content = result_
# print(content)
time.sleep(2)
esMethod.updateaunn(esMethod.index_name, str(id), content, str(result_))
def main(esMethod):
redis_conn = redis.Redis(connection_pool=pool)
id_ = redis_conn.lpop('NianbaoUS:id')
id = id_.decode()
# id = "23101317164"
if id:
if id_:
pass
else:
log.info('已无数据')
return
return False
id = id_.decode()
result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id)
result = result_['hits']['hits'][0]
num = 0
......@@ -135,17 +153,8 @@ def main(esMethod):
log.info(f'====={title}=={social_code}===正在更新===')
sourceAddress = result['_source']['sourceAddress']
ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
soup = get_news(sourceAddress,ip_dic)
if soup:
pass
else:
return
# 相对路径转化为绝对路径
soup = paserUrl(soup, sourceAddress)
content = soup.text.strip()
esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
return
get_news(sourceAddress,sourceAddress,id)
return True
def run_threads(num_threads,esMethod):
......@@ -164,6 +173,9 @@ if __name__ == '__main__':
while True:
esMethod = EsMethod()
start = time.time()
num_threads = 5
run_threads(num_threads,esMethod)
log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file
# num_threads = 5
# run_threads(num_threads,esMethod)
# log.info(f'5线程 总耗时{time.time()-start}秒')
result = main(esMethod)
if not result:
break
\ No newline at end of file
# 证监会沪市、gong深市 公司债券和企业债券采集
"""
证监会企业名单
"""
# 证监会沪市、深市 公司债券和企业债券采集
import time
import random
import requests
......@@ -25,7 +22,7 @@ cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
taskType = '企业名单/证监会'
taskType = '企业债券/证监会'
def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe'
......@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信
page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text
for i in range(1,int(total)+1):
# for i in range(1,int(total)+1):
for i in range(224, 225):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
......@@ -241,7 +239,7 @@ if __name__ == '__main__':
# url_parms = ['201010', '201014']
# url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014']
# url_parms = ['201411']
for url_parm in url_parms:
url = getUrl(url_parm)
......
import yfinance as yf
# 获取股票数据
stock = yf.Ticker("MET")
# 获取资产负债表数据
balance_sheet = stock.balance_sheet
# 获取报告日期
report_dates = balance_sheet.index
print(report_dates)
# 获取现金流量表数据
cashflow_statement = stock.cashflow
# 获取利润表数据
income_statement = stock.financials
print(balance_sheet)
print(cashflow_statement)
print(income_statement)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
......@@ -57,8 +57,8 @@ def page_list():
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
......@@ -69,11 +69,11 @@ def page_list():
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,453):
for i in range(1,2):
log.info(f'采集第{i}页数据')
k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
......@@ -110,8 +110,8 @@ def detailpaser(dmsg):
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
......@@ -122,7 +122,7 @@ def detailpaser(dmsg):
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
try:
......
import json
import time
import uuid
import pymysql
import redis
import requests
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = baseCore.cnx_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
pathType = 'CrowDingZhi/'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
'Host': 'query.sse.com.cn',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
create_time, come, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn')
# log.info(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where type_id=15 and full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# log.info("更新完成:{}".format(pdf_id))
return pdf_id
def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
# response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
response = requests.get(pdf_url)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
for i in range(0, 3):
try:
name = str(getuuid()) + '.' + category
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
if category == 'pdf':
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
else:
page_size = 0
retData['content'] = ''
break
except Exception as e:
time.sleep(3)
continue
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
log.info(f'error---{e}')
return retData
return retData
if __name__ == "__main__":
num = 0
t = int(time.time()*1000)
url_ = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req_ = requests.get(url=url_, headers=headers)
data_json = req_.json()
print(data_json)
pageCount = data_json['pageHelp']['pageCount']
for i in range(1,int(pageCount + 1)):
url = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={i}&pageHelp.beginPage={i}&pageHelp.cacheSize=1&pageHelp.endPage={i}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req = requests.get(url=url, headers=headers)
data_list = req.json()['result']
for info in data_list:
publishDate = info['cmsOpDate'] # 处理日期
year = publishDate[:4]
com = '上海证券交易所'
docTitle = info['docTitle'] # 处理事由
docType = info['docType'] # 文档类型
docURL = "http://" + info['docURL'] # 链接 http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
flag = r.sismember('IN-20231227-0001', docURL)
if flag:
log.info('信息已采集入库过')
continue
# 上传至obs
retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
path = retData['path']
full_path = retData['full_path']
file_size = retData['file_size']
create_by = retData['create_by']
content = retData['content']
status = 1
num += 1
create_time = time_now
# 上传到附件表
att_id = tableUpdate(year, docTitle+'.'+docType, 15, '', '', path, full_path, docType, file_size, num, status, create_by, create_time, com, page_size)
if att_id:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sid = '1739914218978594817'
info_code = "IN-20231227-0001"
dic_news = {
'attachmentIds': str(att_id),
'content': content,
'contentWithTag': '',
'id': '',
'origin': com,
'publishDate': publishDate,
'sid': sid,
'sourceAddress': docURL,
'title': docTitle,
'source':'16',
'type': ''
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(e)
log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
r.sadd(info_code, docURL)
continue
# 中央全面深化改革委员会会议
import json
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'www.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
soup = BeautifulSoup(request.content, 'html.parser')
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
year = int(publishDate[:4])
if year < 2023:
continue
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
'origin': source,
'contentWithTag': str(contentwithTag),
'content': content,
'summary': summary,
'publishDate': publishDate,
'sid': sid,
'subjectId': '1681549361661489154',
'sourceAddress':newsUrl,
'checkStatus': 1,
'deleteFlag': 0,
'createDate': time_now,
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
\ No newline at end of file
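For the link de-duplication todo ('链接判重') in the loop above, a minimal sketch using a Redis set, following the same sismember/sadd pattern the SSE supervision-measures collector in this commit already uses; the set name 'ZYQMSHGG:newsUrl' and db number are assumptions:

import redis

r_dedup = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
DEDUP_KEY = 'ZYQMSHGG:newsUrl'  # illustrative set name

def seen_before(news_url):
    # True if the link was already collected; call r_dedup.sadd(DEDUP_KEY, news_url)
    # only after the article has been sent to Kafka successfully.
    return r_dedup.sismember(DEDUP_KEY, news_url)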
......@@ -27,29 +27,19 @@ class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'policy'
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"query": {
"bool": {
"must": [
{
"nested" : {
"query" : {
"bool" : {
"must" : [
{
"match_phrase" : {
"labels.relationId" : {
"query" : "1698"
}
}
"term": {
"sid.keyword": {
"value": "1662008524476948481"
}
]
}
},
"path" : "labels"
}
}
]
......@@ -112,7 +102,7 @@ def main(page, p, esMethod):
unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
# 删除重复的文档
for doc_id in unique_document_ids:
esMethod.delete(index_name="policy", id=doc_id)
esMethod.delete(index_name="researchreportdata", id=doc_id)
......
......@@ -121,7 +121,7 @@ def get_content2():
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
if '.wps' in file_href or '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
......
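A minimal sketch of an equivalent, easier-to-extend form of the attachment-suffix check above (behaviour preserved: lower-casing covers the mixed-case variants, and the looser bare 'xls' match is kept explicitly):

DOC_SUFFIXES = ('.wps', '.ofd', '.docx', '.doc', '.zip', '.rar', '.ppt', '.pdf')

def is_doc_link(file_href: str) -> bool:
    href = file_href.lower()
    # the original condition also matches a bare 'xls' substring, so keep that looser rule
    return any(suffix in href for suffix in DOC_SUFFIXES) or 'xls' in href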
# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/国内上市'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
social_code = '91130629MA0CG2DL51'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
\ No newline at end of file
# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/中国500强'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:zg500shSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
......@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(10)
time.sleep(20)
s = requests.session()
#获取到token和cookies
......
......@@ -239,6 +239,8 @@ if __name__=="__main__":
list_all_info = []
while True:
#一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo()
if dict_json:
if get_info(dict_json):
......
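For the todo above (pull the item from Redis and update its MySQL state), a minimal sketch under the assumption that article ids are queued in a Redis list and tracked in a MySQL table with a state column; the key 'WeiXinGZH:article' and the table/column names are illustrative:

def pull_and_mark(redis_conn, cursor_, cnx_):
    # Pop one queued article id and mark it as being processed (state = 1) in MySQL.
    item = redis_conn.lpop('WeiXinGZH:article')  # illustrative key
    if item is None:
        return None
    article_id = item.decode()
    cursor_.execute("update wx_article set state = %s where id = %s", (1, article_id))  # illustrative table/column
    cnx_.commit()
    return article_id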
......@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page):
cnx_.commit()
except Exception as e:
log.error(f"保存数据库失败:{e}")
# todo: 放入redis
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0:
#列表为空认为结束
......
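For the '放入redis' (push to Redis) todo above, a minimal sketch that queues each newly inserted link right after the INSERT commits, reusing the rePutIntoR helper from BaseCore earlier in this commit; the key name and helper are assumptions:

def queue_article(baseCore, article_link, key='WeiXinGZH:url'):
    # Push the newly saved link so the detail collector can pick it up later.
    baseCore.rePutIntoR(key, article_link)  # illustrative key name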
from bs4 import BeautifulSoup
import requests,time,re
from base import BaseCore
# import pandas as pd
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
log = baseCore.getLogger()
taskType = '500强专利'
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
# df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
def name_handle(english_name_):
if 'INC.' in english_name_ or 'LTD.' in english_name_ or 'CO.' in english_name_ \
or 'CORP.' in english_name_ or 'GMBH' in english_name_ \
or ' AG' in english_name_ or 'SARL' in english_name_ or 'S.A.' in english_name_ \
or 'PTY' in english_name_ or 'LLC' in english_name_ or 'LLP' in english_name_ \
or ' AB' in english_name_ or ' NV' in english_name_ or 'N.V.' in english_name_ \
or 'A.S.' in english_name_ or ' SA' in english_name_ or ',Limited' in english_name_ \
or ' SE' in english_name_ or ' LPC' in english_name_ or 'S.P.A.' in english_name_:
english_name = english_name_.replace('INC.', '').replace('LTD.', '').replace('CO.', '').replace('CORP.', '') \
.replace('GMBH', '').replace(' AG', '').replace('SARL', '').replace('S.A.', '').replace('PTY', '') \
.replace('LLC', '').replace('LLP', '').replace(' AB', '').replace(' NV', '').replace(',', '') \
.replace('A.S.', '').replace(' SA', '').replace(',Limited', '').replace(' SE', '').replace(' PLC', '') \
.replace('N.V.', '').replace('S.P.A.', '').rstrip()
return english_name
else:
english_name = english_name_
return english_name
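# Worked examples (illustrative): name_handle('APPLE INC.') -> 'APPLE',
# name_handle('SIEMENS AG') -> 'SIEMENS'; a name without a recognised suffix is returned unchanged.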
if __name__ == '__main__':
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gwSocial_code')
# social_code = '9111000071093123XX'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
english_name_ = data[5]
place = data[6]
if place == 1:
log.info(f'{com_name}--国内')
baseCore.rePutIntoR('Zhuanli:gwSocial_code',social_code)
continue
if english_name_:
pass
else:
query = f"select * from sys_base_enterprise where social_credit_code ='{xydm}'"
cursor_.execute(query)
reslut = cursor_.fetchone()
english_name_ = reslut[32]
# todo:将该字段更新到144企业库
update_ = f"update EnterpriseInfo set EnglishName='{english_name_}' where SocialCode='{xydm}' "
cursor.execute(update_)
cnx.commit()
english_name_ = english_name_.upper()
english_name = name_handle(english_name_)
num_zhuanli = 0
# url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
# int(float(time.time()) * 1000))
#
# res1 = requests.get(url1, headers=headers)
# soup1 = BeautifulSoup(res1.content, 'html.parser')
#
# num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
#
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
# if zhuanli:
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={english_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli == 0:
dic_info = {
'com_name': com_name,
'social_code': social_code,
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-----已经存在--{year}--无专利信息")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}------新增----无专利信息")
break
dic_info = {
'com_name': com_name,
'social_code': social_code,
'year': year,
'num': zhuanli
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
except:
log.info("error!{}".format(social_code))
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
\ No newline at end of file
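The nested try/except branches above parse three possible phrasings of Espacenet's result-count message (约/多于/找到). A minimal sketch of that parsing logic in one helper, not part of the original script:
import re

def parse_result_count(num_text):
    # The message may read "约 1,234 个", "多于 10,000 个" or "找到 56 个".
    for pattern in ("约(.*?)个", "多于(.*?)个", "找到(.*?)个"):
        match = re.search(pattern, num_text)
        if match:
            return int(match.group(1).replace(',', '').strip())
    raise ValueError('unrecognised result-count text: ' + num_text)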
import requests,re,time,os,datetime,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import redis
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
for i in range(len(df_all['英文名称'])):
for num in range(0, 2):
try:
if '中国' not in df_all['企业所属国家'][i]:
com_name = df_all['英文名称'][i]
num_zhuanli = 0
url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
int(float(time.time()) * 1000))
res1 = requests.get(url1, headers=headers)
soup1 = BeautifulSoup(res1.content, 'html.parser')
num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
zhuanli = '10000'
if zhuanli == '10000':
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={com_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli2 = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli2 == 0:
break
df_all[f'{year}'][i] = zhuanli2
# num_zhuanli = num_zhuanli + zhuanli2
num_zhuanli = num_zhuanli + zhuanli2
print(year)
time.sleep(random.uniform(1.5, 2))
else:
num_zhuanli = int(zhuanli)
time.sleep(random.uniform(1.5, 2))
df_all['Espacenet专利检索'][i] = num_zhuanli
print(f"{com_name} : {num_zhuanli}")
break
except:
if num == 0:
print("重试")
time.sleep(60)
continue
else:
print("error!{}".format(df_all['英文名称'][i]))
\ No newline at end of file
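Assignments such as df_all[f'{year}'][i] = zhuanli2 above use chained indexing, which pandas may apply to a temporary copy. A small illustrative sketch of the .loc form that always writes in place (not a change to the script above):
import pandas as pd

df = pd.DataFrame({'英文名称': ['APPLE'], '2022': [0]})
df.loc[0, '2022'] = 1234   # same intent as df['2022'][0] = 1234, without the chained-assignment pitfall
print(df)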
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
try:
ip = baseCore.get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
except:
for i in range(3):
try:
res_j = requests.get(url=url, headers=header, verify=False).json()
except:
time.sleep(2)
continue
# print(res_j)
try:
list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all)
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
else:
return 0
if __name__ == "__main__":
while True:
list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code_zg500')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code is None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid is None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
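For reference, the per-field try/except blocks in spider_zhuanli could be condensed with dict.get. This is only an illustrative sketch using the same Tianyancha field names, not the code in this commit:
def extract_patent(one_zhuanli):
    # Missing keys fall back to '' instead of raising, mirroring the try/except defaults above.
    return {
        'title': one_zhuanli.get('title', ''),
        'shenqingri': one_zhuanli.get('applicationTime', ''),
        'shenqing_code': one_zhuanli.get('patentNum', ''),
        'leixing': one_zhuanli.get('patentType', ''),
        'status': one_zhuanli.get('lprs', ''),
        'gongkairi': one_zhuanli.get('pubDate', ''),
        'gongkai_code': one_zhuanli.get('pubnumber', ''),
        'famingren': one_zhuanli.get('inventor', ''),
        'shenqingren': one_zhuanli.get('applicantName', ''),
        'gongneng': one_zhuanli.get('cat', ''),
        'uuid': one_zhuanli.get('uuid', ''),
    }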
......@@ -13,9 +13,10 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu_test
groupId=python_google
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
......@@ -168,6 +168,8 @@ class GoogleSpider(object):
try:
driver.get(url)
# 等待页面加载完成
time.sleep(3)
driver.refresh()
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
html=driver.page_source
......@@ -256,6 +258,7 @@ class GoogleSpider(object):
self.driver.get(self.url)
# 等待页面加载完成
time.sleep(3)
self.driver.refresh()
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
search_input = self.driver.find_element('xpath', '//textarea[@title="Google 搜索"]')
......@@ -265,7 +268,11 @@ class GoogleSpider(object):
time.sleep(3)
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
try:
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click()
except:
self.driver.find_element('xpath', '//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span').click()
time.sleep(3)
self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
time.sleep(2)
......@@ -273,7 +280,8 @@ class GoogleSpider(object):
time.sleep(2)
self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
except Exception as e:
print(e)
self.logger.info(f'--点击按钮失效----{e}')
return
self.logger.info("开始抓取首页..." + self.searchkw )
time.sleep(5)
flag, lists = self.parse_page()
......@@ -446,7 +454,7 @@ class GoogleSpider(object):
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
......
......@@ -40,7 +40,7 @@ class GoogleTaskJob(object):
try:
for record in consumer:
try:
logger.info("value:",record.value)
logger.info(f"value:{record.value}")
keymsg=record.value
if keymsg:
break
......@@ -176,7 +176,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
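A self-contained sketch of the submit/collect pattern used with the thread pool above (function and keyword names are illustrative, not from the repo):
import concurrent.futures
import time

def run_spider(kw):
    time.sleep(0.1)           # stand-in for the real crawl
    return 'done: ' + kw

kw_list = ['REITs', 'Espacenet']
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    futures = [executor.submit(run_spider, kw) for kw in kw_list]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())   # re-raises any exception thrown inside the worker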
import requests
url = 'https://www.ctwant.com/article/308534'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
req = requests.get(url, headers=headers)
print(req.text)
\ No newline at end of file
......@@ -113,23 +113,23 @@ if __name__=='__main__':
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except:
continue
# if len(author)>4:
# continue
if len(author)>4:
continue
# if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \
# or '”' in author\
# or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
if '(' in author or '本刊' in author \
or '记者' in author or '新闻社' in author \
or '”' in author\
or '大学' in author or '洛桑江村' in author:
continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
if is_member:
continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
......@@ -165,7 +165,7 @@ if __name__=='__main__':
}
log.info(dic_news)
if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href)
r.sadd('qiushileaderspeech_two::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}')
......@@ -55,56 +55,56 @@ from obs import ObsClient
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def use_ocr(img):
ocr = ddddocr.DdddOcr()
with open(img, 'rb') as f:
image = f.read()
res = ocr.classification(image)
print(res)
return res
if __name__=="__main__":
requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}')
requests.adapters.DEFAULT_RETRIES = 3
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "http://zxgk.court.gov.cn/shixin/"
browser.get(url)
# 可改动
time.sleep(20)
page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser')
img_url = soup.select('img[id="captchaImg"]')[0]['src']
browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
browser.find_element(By.ID, 'yzm').send_keys(yzm)
browser.find_element(By.ID, 'searchForm').click()
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# baseCore = BaseCore()
# log = baseCore.getLogger()
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
#
# def use_ocr(img):
# ocr = ddddocr.DdddOcr()
# with open(img, 'rb') as f:
# image = f.read()
# res = ocr.classification(image)
# print(res)
# return res
#
# if __name__=="__main__":
# requests.DEFAULT_RETRIES = 5
# time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# log.info(f'开始时间为:{time_start}')
#
# requests.adapters.DEFAULT_RETRIES = 3
# headers = {
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
# }
#
# opt = webdriver.ChromeOptions()
# opt.add_argument(
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
#
# opt.add_argument("--ignore-certificate-errors")
# opt.add_argument("--ignore-ssl-errors")
# opt.add_experimental_option("excludeSwitches", ["enable-automation"])
# opt.add_experimental_option('excludeSwitches', ['enable-logging'])
# opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
# chromedriver = r'D:/cmd100/chromedriver.exe'
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
# url = "http://zxgk.court.gov.cn/shixin/"
# browser.get(url)
# # 可改动
# time.sleep(20)
# page_source = browser.page_source
# soup = BeautifulSoup(page_source, 'html.parser')
# img_url = soup.select('img[id="captchaImg"]')[0]['src']
#
# browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
#
#
# browser.find_element(By.ID, 'yzm').send_keys(yzm)
# browser.find_element(By.ID, 'searchForm').click()
# wait = WebDriverWait(browser, 30)
# wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# screen_img_path = "D:/screen/xxx.png"
# out_img_path = "D:/out/xxx.png"
......@@ -112,3 +112,27 @@ if __name__=="__main__":
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
\ No newline at end of file
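A minimal variant of the download above that streams the PDF to disk in chunks rather than holding it in memory; it reuses the url and headers defined in the snippet above, and stream=True is an assumption, not part of the original:
import requests

with requests.get(url, headers=headers, stream=True, timeout=30) as resp:
    resp.raise_for_status()
    with open('./a.pdf', 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)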
#coding=utf-8
......@@ -25,7 +25,7 @@ from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
# baseCore=BaseCore()
class BaiduSpider(object):
def __init__(self,searchkw,wordsCode,sid):
......@@ -40,13 +40,15 @@ class BaiduSpider(object):
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
# chrome_driver =self.config.get('selenium', 'chrome_driver')
# self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# path = Service(chrome_driver)
# chrome_options = webdriver.ChromeOptions()
# chrome_options.binary_location = self.config.get('selenium', 'binary_location')
# proxy = baseCore.get_proxy()
# chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# # driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
......@@ -54,14 +56,16 @@ class BaiduSpider(object):
self.wordsCode = wordsCode
self.sid = sid
def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
#将列表数据插入到表中 meta_search_result
def itemInsertToTable(self,items):
try:
......
# -*- coding: utf-8 -*-
......@@ -12,12 +12,16 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
from tqdm import tqdm
class BaiduTaskJob(object):
def __init__(self):
......@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
for record in tqdm(consumer, desc="Consuming messages"):
try:
logger.info("value:",record.value)
keymsg=record.value
......@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
......@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
......@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html()
except Exception as e:
try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html()
except Exception as e:
logger.info('百度搜索异常'+searchkw)
......
# -*- coding: utf-8 -*-
......@@ -293,6 +293,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
......@@ -304,8 +305,8 @@ class BaseCore:
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, len(proxy_list) - 1)]
......
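requests looks proxies up by the lowercase URL scheme, which is why the change from "HTTP"/"HTTPS" to "http"/"https" keys above matters. A minimal sketch with an illustrative proxy address:
import requests

proxy_meta = '127.0.0.1:8888'   # illustrative address, not from the repo
proxies = {'http': proxy_meta, 'https': proxy_meta}
resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(resp.text)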
[redis]
......@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
# from baiduSpider import BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# try:
# detailurl = url
# title = title
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
# except Exception as e:
# content = ''
# contentWithTag = ''
#
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
res_json = res.text
print(res_json)
\ No newline at end of file
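The cb= parameter makes this endpoint return JSONP, so res_json comes back wrapped in a jQuery...(...) callback. A sketch for recovering the JSON payload, assuming that wrapper format:
import json
import re

match = re.search(r'^[^(]*\((.*)\)\s*$', res_json, re.S)
if match:
    payload = json.loads(match.group(1))
    print(list(payload.keys()))   # inspect the top-level fields before relying on any of them
else:
    print('unexpected response format:', res_json[:100])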