Commit 23d4dd76 Author: 薛凌堃

24/01/02

Parent be4f79be
...@@ -464,7 +464,8 @@ def zhengquanqihuo(): ...@@ -464,7 +464,8 @@ def zhengquanqihuo():
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse(): def sse():
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761' # url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_=1703469889542'
headers = { headers = {
'Accept': '*/*', 'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate', 'Accept-Encoding': 'gzip, deflate',
...@@ -485,9 +486,13 @@ def sse(): ...@@ -485,9 +486,13 @@ def sse():
# os.makedirs(path) # os.makedirs(path)
for page in range(0, int(total_page)): for page in range(0, int(total_page)):
t = int(time.time()) t = int(time.time())
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=24278800487459370386559742313666&_={t}' url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_={t}'
data = policy.getrequest_json(headers, url_page) data = policy.getrequest_json(headers, url_page)
newslist = data['data']['knowledgeList'] newslist = data['data']['knowledgeList']
# if newslist:
# pass
# else:
# continue
# print(newslist) # print(newslist)
for news in newslist: for news in newslist:
num += 1 num += 1
...@@ -521,8 +526,8 @@ def sse(): ...@@ -521,8 +526,8 @@ def sse():
content = '' content = ''
response = requests.get(newsUrl, timeout=20) response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc: with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages(): for page_ in doc.pages():
content += page.get_text() content += page_.get_text()
file_href = newsUrl file_href = newsUrl
file_name = title file_name = title
...@@ -628,7 +633,7 @@ def sse(): ...@@ -628,7 +633,7 @@ def sse():
for att_id in id_list: for att_id in id_list:
baseCore.deliteATT(att_id) baseCore.deliteATT(att_id)
except Exception as e: except Exception as e:
log.info(f"error!!!{newsUrl}") log.info(f"error!!!{newsUrl}===={title}")
log.info(e) log.info(e)
log.info(f'====第{page}页====处理结束,================') log.info(f'====第{page}页====处理结束,================')
...@@ -972,14 +977,14 @@ def guizhou(): ...@@ -972,14 +977,14 @@ def guizhou():
if __name__=="__main__": if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx' # file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path) # wb = policy.createfile(file_path)
reform() # reform()
# shenzhen() # # shenzhen()
zhengquanqihuo() # zhengquanqihuo()
try: try:
sse() sse()
except: except:
pass pass
hebei() # hebei()
guizhou() # guizhou()
# zhengquanqihuo() # zhengquanqihuo()
\ No newline at end of file
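The rename of the inner loop variable from page to page_ in sse() matters because the outer pagination counter is also named page and is still used in the closing log line (第{page}页 处理结束); with the old name, that line logged the last fitz page object instead of the page number. A minimal illustration of the shadowing, with a plain list standing in for doc.pages() (everything below is illustrative, not code from the repo):

# before the rename: the inner loop reuses the name 'page'
for page in range(0, 3):                        # pagination counter
    for page in ["pdf_page_1", "pdf_page_2"]:   # shadows the counter
        pass
    print(f"第{page}页 处理结束")                 # prints 'pdf_page_2', not 0/1/2

# after the rename: the counter survives the inner loop
for page in range(0, 3):
    for page_ in ["pdf_page_1", "pdf_page_2"]:
        pass
    print(f"第{page}页 处理结束")                 # prints 0, 1, 2 as intended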
...@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen ...@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing from REITs_policyData.policy_beijing import beijing
if __name__ == "__mian__": if __name__ == "__main__":
beijing() beijing()
reits.sse() reits.sse()
reits.reform() reits.reform()
......
...@@ -674,7 +674,7 @@ if __name__ == "__main__": ...@@ -674,7 +674,7 @@ if __name__ == "__main__":
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
# MengZhi() # MengZhi()
# NQEnterprise() NQEnterprise()
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
...@@ -683,6 +683,6 @@ if __name__ == "__main__": ...@@ -683,6 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task() # AnnualEnterprise_task()
# FinanceFromEast() # FinanceFromEast()
# ipo_code() # ipo_code()
JingyingfenxiFromEase() # JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===') log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
...@@ -43,7 +43,7 @@ class EsMethod(object): ...@@ -43,7 +43,7 @@ class EsMethod(object):
"must": [ "must": [
{ {
"match": { "match": {
"type": "1" "type": "0"
} }
} }
] ]
...@@ -115,7 +115,7 @@ def main(page, p, esMethod): ...@@ -115,7 +115,7 @@ def main(page, p, esMethod):
attid = mms['_source']['attachmentIds'][0] attid = mms['_source']['attachmentIds'][0]
log.info(f'{id}-{attid}--{title}--{sourceAddress}---') log.info(f'{id}-{attid}--{title}--{sourceAddress}---')
selects = secrchATT('1', attid) selects = secrchATT('4', attid)
if selects: if selects:
pass pass
else: else:
......
...@@ -53,12 +53,12 @@ class EsMethod(object): ...@@ -53,12 +53,12 @@ class EsMethod(object):
# 'hits.hits._source.createDate', # 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate', # 'hits.hits._source.publishDate',
] # 字段2 ] # 字段2
result = self.es.search(index=index_name resultb = self.es.search(index=index_name
, doc_type='_doc' , doc_type='_doc'
, filter_path=filter_path , filter_path=filter_path
, body=body) , body=body)
# log.info(result) # log.info(result)
return result return resultb
def updateaunn(self, index_name, id, content, contentWithTag): def updateaunn(self, index_name, id, content, contentWithTag):
body = { body = {
...@@ -67,24 +67,28 @@ class EsMethod(object): ...@@ -67,24 +67,28 @@ class EsMethod(object):
'contentWithTag': contentWithTag 'contentWithTag': contentWithTag
} }
} }
result = self.es.update(index=index_name resulta = self.es.update(index=index_name
,id=id ,id=id
,body=body) ,body=body)
log.info('更新结果:%s' % result) log.info('更新结果:%s' % resulta)
def paserUrl(html,listurl): def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签 # 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img']) links = html.find_all(['a', 'img'])
print(len(links))
# 遍历标签,将相对地址转换为绝对地址 # 遍历标签,将相对地址转换为绝对地址
for link in links: for link in links:
print(link)
if 'href' in link.attrs: if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href']) # link['href'] = urljoin(listurl, link['href'])
pass
elif 'src' in link.attrs: elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src']) pass
# link['src'] = urljoin(listurl, link['src'])
return html return html
def get_news(news_url,ip_dic): def get_news(news_url,sourceAddress,id):
header = { header = {
'Host': 'www.sec.gov', 'Host': 'www.sec.gov',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic): ...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic):
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D' 'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
} }
response = requests.get(url=news_url,headers=header,verify=False,timeout=30) response = requests.get(url=news_url,headers=header,verify=False)
# aa = response.text
# print(response.text)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30) # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if response.status_code == 200: if response.status_code == 200:
# 请求成功,处理响应数据 # 请求成功,处理响应数据
# print(response.text) # print(response.text)
result = BeautifulSoup(response.content,'html.parser') # result_ = BeautifulSoup(response.content,'html.parser')
result_ = BeautifulSoup(response.text, 'lxml')
# print(result) # print(result)
pass pass
else: else:
# 请求失败,输出错误信息 # 请求失败,输出错误信息
log.info('请求失败:', response.status_code, response.text) log.info('请求失败:', response.status_code, response.text)
result = '' result_ = ''
return result if result_:
pass
# 相对路径转化为绝对路径
# soup = paserUrl(result_, sourceAddress)
time.sleep(2)
content = result_.text.strip()
# del(result_)
# content = result_
# print(content)
time.sleep(2)
esMethod.updateaunn(esMethod.index_name, str(id), content, str(result_))
def main(esMethod): def main(esMethod):
redis_conn = redis.Redis(connection_pool=pool) redis_conn = redis.Redis(connection_pool=pool)
id_ = redis_conn.lpop('NianbaoUS:id') id_ = redis_conn.lpop('NianbaoUS:id')
id = id_.decode()
# id = "23101317164" # id = "23101317164"
if id: if id_:
pass pass
else: else:
log.info('已无数据') log.info('已无数据')
return return False
id = id_.decode()
result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id) result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id)
result = result_['hits']['hits'][0] result = result_['hits']['hits'][0]
num = 0 num = 0
...@@ -135,17 +153,8 @@ def main(esMethod): ...@@ -135,17 +153,8 @@ def main(esMethod):
log.info(f'====={title}=={social_code}===正在更新===') log.info(f'====={title}=={social_code}===正在更新===')
sourceAddress = result['_source']['sourceAddress'] sourceAddress = result['_source']['sourceAddress']
ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'} ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
soup = get_news(sourceAddress,ip_dic) get_news(sourceAddress,sourceAddress,id)
if soup: return True
pass
else:
return
# 相对路径转化为绝对路径
soup = paserUrl(soup, sourceAddress)
content = soup.text.strip()
esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
return
def run_threads(num_threads,esMethod): def run_threads(num_threads,esMethod):
...@@ -164,6 +173,9 @@ if __name__ == '__main__': ...@@ -164,6 +173,9 @@ if __name__ == '__main__':
while True: while True:
esMethod = EsMethod() esMethod = EsMethod()
start = time.time() start = time.time()
num_threads = 5 # num_threads = 5
run_threads(num_threads,esMethod) # run_threads(num_threads,esMethod)
log.info(f'5线程 总耗时{time.time()-start}秒') # log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file result = main(esMethod)
if not result:
break
\ No newline at end of file
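The reshuffle in main() — testing id_ before calling .decode() — guards the empty-queue case: Redis lpop returns None when the list 'NianbaoUS:id' is exhausted, and None.decode() would raise AttributeError before the '已无数据' branch was ever reached. A minimal sketch of the pattern, assuming a reachable Redis instance (host and port below are placeholders):

import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)  # placeholder connection

raw = r.lpop("NianbaoUS:id")   # bytes, or None when the list is empty
if raw is None:
    print("已无数据")           # nothing left; the caller returns False and the loop ends
else:
    doc_id = raw.decode()      # safe to decode only after the None check
    print(f"processing {doc_id}")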
# 证监会沪市、gong深市 公司债券和企业债券采集 # 证监会沪市、深市 公司债券和企业债券采集
"""
证监会企业名单
"""
import time import time
import random import random
import requests import requests
...@@ -25,7 +22,7 @@ cursor = baseCore.cursor ...@@ -25,7 +22,7 @@ cursor = baseCore.cursor
cnx_ = baseCore.cnx_ cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_ cursor_ = baseCore.cursor_
taskType = '企业名单/证监会' taskType = '企业债券/证监会'
def createDriver(): def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe' chrome_driver = r'D:\cmd100\chromedriver.exe'
...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信 ...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信
page = soup.find('div', class_='pages').find_all('li')[-1] page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text total = page.find('b').text
for i in range(1,int(total)+1): # for i in range(1,int(total)+1):
for i in range(224, 225):
log.info(f'==========正在采集第{i}页=========') log.info(f'==========正在采集第{i}页=========')
if i == 1: if i == 1:
href = url href = url
...@@ -241,7 +239,7 @@ if __name__ == '__main__': ...@@ -241,7 +239,7 @@ if __name__ == '__main__':
# url_parms = ['201010', '201014'] # url_parms = ['201010', '201014']
# url_parms = ['201011', '201013'] # url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014'] url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014'] # url_parms = ['201411']
for url_parm in url_parms: for url_parm in url_parms:
url = getUrl(url_parm) url = getUrl(url_parm)
......
import yfinance as yf
# 获取股票数据
stock = yf.Ticker("MET")
# 获取资产负债表数据
balance_sheet = stock.balance_sheet
# 获取报告日期(yfinance 的报表 DataFrame 以日期为列、科目为行)
report_dates = balance_sheet.columns
print(report_dates)
# 获取现金流量表数据
cashflow_statement = stock.cashflow
# 获取利润表数据
income_statement = stock.financials
print(balance_sheet)
print(cashflow_statement)
print(income_statement)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
...@@ -57,8 +57,8 @@ def page_list(): ...@@ -57,8 +57,8 @@ def page_list():
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -69,11 +69,11 @@ def page_list(): ...@@ -69,11 +69,11 @@ def page_list():
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList' url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,453): for i in range(1,2):
log.info(f'采集第{i}页数据') log.info(f'采集第{i}页数据')
k=i k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}' da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
...@@ -110,8 +110,8 @@ def detailpaser(dmsg): ...@@ -110,8 +110,8 @@ def detailpaser(dmsg):
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -122,7 +122,7 @@ def detailpaser(dmsg): ...@@ -122,7 +122,7 @@ def detailpaser(dmsg):
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
try: try:
......
import json
import time
import uuid
import pymysql
import redis
import requests
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = baseCore.cnx_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
pathType = 'CrowDingZhi/'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
'Host': 'query.sse.com.cn',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
create_time, come, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn')
# log.info(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where type_id=15 and full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# log.info("更新完成:{}".format(pdf_id))
return pdf_id
def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
# response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
response = requests.get(pdf_url)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
for i in range(0, 3):
try:
name = str(getuuid()) + '.' + category
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
if category == 'pdf':
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
else:
page_size = 0
retData['content'] = ''
break
except Exception as e:
time.sleep(3)
continue
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
log.info(f'error---{e}')
return retData
return retData
if __name__ == "__main__":
num = 0
t = int(time.time()*1000)
url_ = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req_ = requests.get(url=url_, headers=headers)
data_json = req_.json()
print(data_json)
pageCount = data_json['pageHelp']['pageCount']
for i in range(1,int(pageCount + 1)):
url = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={i}&pageHelp.beginPage={i}&pageHelp.cacheSize=1&pageHelp.endPage={i}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req = requests.get(url=url, headers=headers)
data_list = req.json()['result']
for info in data_list:
publishDate = info['cmsOpDate'] # 处理日期
year = publishDate[:4]
com = '上海证券交易所'
docTitle = info['docTitle'] # 处理事由
docType = info['docType'] # 文档类型
docURL = "http://" + info['docURL'] # 链接 http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
flag = r.sismember('IN-20231227-0001', docURL)
if flag:
log.info('信息已采集入库过')
continue
# 上传至obs
retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
path = retData['path']
full_path = retData['full_path']
file_size = retData['file_size']
create_by = retData['create_by']
content = retData['content']
status = 1
num += 1
create_time = time_now
# 上传到附件表
att_id = tableUpdate(year, docTitle+'.'+docType, 15, '', '', path, full_path, docType, file_size, num, status, create_by, create_time, com, page_size)
if att_id:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sid = '1739914218978594817'
info_code = "IN-20231227-0001"
dic_news = {
'attachmentIds': str(att_id),
'content': content,
'contentWithTag': '',
'id': '',
'origin': com,
'publishDate': publishDate,
'sid': sid,
'sourceAddress': docURL,
'title': docTitle,
'source':'16',
'type': ''
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(e)
log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
r.sadd(info_code, docURL)
continue
# 中央全面深化改革委员会会议
import json
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'www.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
soup = BeautifulSoup(request.content, 'html.parser')
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
year = int(publishDate[:4])
if year < 2023:
continue
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
'origin': source,
'contentWithTag': str(contentwithTag),
'content': content,
'summary': summary,
'publishDate': publishDate,
'sid': sid,
'subjectId': '1681549361661489154',
'sourceAddress':newsUrl,
'checkStatus': 1,
'deleteFlag': 0,
'createDate': time_now,
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
\ No newline at end of file
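The 12371.cn collector above still carries a # todo: 链接判重 note before each article request; elsewhere in this commit the same need is met with a Redis set — sismember to skip URLs seen in earlier runs, sadd once the Kafka send succeeds. A small sketch of that pattern, again assuming a reachable Redis instance; the key name is a placeholder:

import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)  # placeholder connection
seen_key = "shenggaiwei:news"                       # placeholder dedup set

def already_collected(url: str) -> bool:
    # True if this article URL was handled in an earlier run
    return bool(r.sismember(seen_key, url))

def mark_collected(url: str) -> None:
    # record the URL only after the message is sent successfully
    r.sadd(seen_key, url)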
...@@ -27,29 +27,19 @@ class EsMethod(object): ...@@ -27,29 +27,19 @@ class EsMethod(object):
def __init__(self): def __init__(self):
# 创建Elasticsearch对象,并提供账号信息 # 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300) self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'policy' self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum): def queryatt(self,index_name,pnum):
body = { body = {
"query": { "query": {
"bool": { "bool": {
"must": [ "must": [
{ {
"nested" : { "term": {
"query" : { "sid.keyword": {
"bool" : { "value": "1662008524476948481"
"must" : [
{
"match_phrase" : {
"labels.relationId" : {
"query" : "1698"
}
}
} }
]
}
},
"path" : "labels"
} }
} }
] ]
...@@ -112,7 +102,7 @@ def main(page, p, esMethod): ...@@ -112,7 +102,7 @@ def main(page, p, esMethod):
unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents] unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
# 删除重复的文档 # 删除重复的文档
for doc_id in unique_document_ids: for doc_id in unique_document_ids:
esMethod.delete(index_name="policy", id=doc_id) esMethod.delete(index_name="researchreportdata", id=doc_id)
......
# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/国内上市'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
...@@ -56,7 +56,7 @@ if __name__=="__main__": ...@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
browser.get(url) browser.get(url)
# 可改动 # 可改动
time.sleep(10) time.sleep(20)
s = requests.session() s = requests.session()
#获取到token和cookies #获取到token和cookies
......
...@@ -239,6 +239,8 @@ if __name__=="__main__": ...@@ -239,6 +239,8 @@ if __name__=="__main__":
list_all_info = [] list_all_info = []
while True: while True:
#一次拿取一篇文章 #一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo() dict_json =getjsonInfo()
if dict_json: if dict_json:
if get_info(dict_json): if get_info(dict_json):
......
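The new # todo: 从redis拿数据 更新mysql状态 note marks the consumer side of a hand-off: pop an article id from Redis, process it, then flag its MySQL row as handled. One possible shape for that step — the queue key, table and column names are all assumptions, not taken from the repo:

import pymysql
import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)                       # placeholder
cnx = pymysql.connect(host="127.0.0.1", user="caiji", password="***",
                      db="caiji", charset="utf8mb4")                     # placeholder

raw = r.lpop("WeiXinGZH:article_id")          # hypothetical queue key
if raw is not None:
    article_id = raw.decode()
    with cnx.cursor() as cursor:
        cursor.execute(
            "update wx_link set state = %s where id = %s",               # hypothetical table/columns
            (1, article_id),
        )
    cnx.commit()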
...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page): ...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page):
cnx_.commit() cnx_.commit()
except Exception as e: except Exception as e:
log.error(f"保存数据库失败:{e}") log.error(f"保存数据库失败:{e}")
# todo: 放入redis
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------") log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0: if listCount==0:
#列表为空认为结束 #列表为空认为结束
......
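The matching # todo: 放入redis note in insertWxList is the producer side of the same hand-off: after a new link row is inserted, push its id onto the queue that the consumer above pops from. Under the same placeholder names:

import pymysql
import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)                       # placeholder
cnx = pymysql.connect(host="127.0.0.1", user="caiji", password="***",
                      db="caiji", charset="utf8mb4")                     # placeholder

with cnx.cursor() as cursor:
    cursor.execute(
        "insert into wx_link (link, state) values (%s, %s)",             # hypothetical table/columns
        ("https://mp.weixin.qq.com/s/xxxx", 0),
    )
    cnx.commit()
    r.rpush("WeiXinGZH:article_id", str(cursor.lastrowid))               # hand the new id to the consumer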
from bs4 import BeautifulSoup
import requests,time,re
from base import BaseCore
# import pandas as pd
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
log = baseCore.getLogger()
taskType = '500强专利'
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
# df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
def name_handle(english_name_):
if 'INC.' in english_name_ or 'LTD.' in english_name_ or 'CO.' in english_name_ \
or 'CORP.' in english_name_ or 'GMBH' in english_name_ \
or ' AG' in english_name_ or 'SARL' in english_name_ or 'S.A.' in english_name_ \
or 'PTY' in english_name_ or 'LLC' in english_name_ or 'LLP' in english_name_ \
or ' AB' in english_name_ or ' NV' in english_name_ or 'N.V.' in english_name_ \
or 'A.S.' in english_name_ or ' SA' in english_name_ or ',Limited' in english_name_ \
or ' SE' in english_name_ or ' LPC' in english_name_ or 'S.P.A.' in english_name_:
english_name = english_name_.replace('INC.', '').replace('LTD.', '').replace('CO.', '').replace('CORP.', '') \
.replace('GMBH', '').replace(' AG', '').replace('SARL', '').replace('S.A.', '').replace('PTY', '') \
.replace('LLC', '').replace('LLP', '').replace(' AB', '').replace(' NV', '').replace(',', '') \
.replace('A.S.', '').replace(' SA', '').replace(',Limited', '').replace(' SE', '').replace(' PLC', '') \
.replace('N.V.', '').replace('S.P.A.', '').rstrip()
return english_name
else:
english_name = english_name_
return english_name
if __name__ == '__main__':
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gwSocial_code')
# social_code = '9111000071093123XX'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
english_name_ = data[5]
place = data[6]
if place == 1:
log.info(f'{com_name}--国内')
baseCore.rePutIntoR('ZhuanLi_500:zgSocial_code',social_code)
continue
if english_name_:
pass
else:
query = f"select * from sys_base_enterprise where social_credit_code ='{xydm}'"
cursor_.execute(query)
reslut = cursor_.fetchone()
english_name_ = reslut[32]
# todo:将该字段更新到144企业库
update_ = f"update EnterpriseInfo set EnglishName='{english_name_}' where SocialCode='{xydm}' "
cursor.execute(update_)
cnx.commit()
english_name_ = english_name_.upper()
english_name = name_handle(english_name_)
num_zhuanli = 0
# url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
# int(float(time.time()) * 1000))
#
# res1 = requests.get(url1, headers=headers)
# soup1 = BeautifulSoup(res1.content, 'html.parser')
#
# num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
#
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
# if zhuanli:
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={english_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli == 0:
dic_info = {
'com_name': com_name,
'social_code': social_code,
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-----已经存在--{year}--无专利信息")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}------新增----无专利信息")
break
dic_info = {
'com_name': com_name,
'social_code': social_code,
'year': year,
'num': zhuanli
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
except:
log.info("error!{}".format(social_code))
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
\ No newline at end of file
import requests,re,time,os,datetime,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import redis
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
for i in range(len(df_all['英文名称'])):
for num in range(0, 2):
try:
if '中国' not in df_all['企业所属国家'][i]:
com_name = df_all['英文名称'][i]
num_zhuanli = 0
url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
int(float(time.time()) * 1000))
res1 = requests.get(url1, headers=headers)
soup1 = BeautifulSoup(res1.content, 'html.parser')
num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
zhuanli = '10000'
if zhuanli == '10000':
for year in range(2023, 1900, -1):
# url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={com_name}&IN=&CPC=&IC=&rnd=' + str(
# int(float(time.time()) * 1000))
url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli2 = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli2 == 0:
break
df_all[f'{year}'][i] = zhuanli2
# num_zhuanli = num_zhuanli + zhuanli2
num_zhuanli = num_zhuanli + zhuanli2
print(year)
time.sleep(random.uniform(1.5, 2))
else:
num_zhuanli = int(zhuanli)
time.sleep(random.uniform(1.5, 2))
df_all['Espacenet专利检索'][i] = num_zhuanli
print(f"{com_name} : {num_zhuanli}")
break
except:
if num == 0:
print("重试")
time.sleep(60)
continue
else:
print("error!{}".format(df_all['英文名称'][i]))
\ No newline at end of file
...@@ -53,7 +53,27 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info): ...@@ -53,7 +53,27 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
time.sleep(2) time.sleep(2)
continue continue
# print(res_j) # print(res_j)
try:
list_all = res_j['data']['items'] list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all) # print(list_all)
if list_all: if list_all:
for one_zhuanli in list_all: for one_zhuanli in list_all:
...@@ -140,7 +160,7 @@ if __name__ == "__main__": ...@@ -140,7 +160,7 @@ if __name__ == "__main__":
list_all_info = [] list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code') social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# social_code = '9111010566840059XP' # social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
# time.sleep(20) # time.sleep(20)
......
...@@ -113,23 +113,23 @@ if __name__=='__main__': ...@@ -113,23 +113,23 @@ if __name__=='__main__':
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '') author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except: except:
continue continue
# if len(author)>4: if len(author)>4:
# continue continue
# if '(' in author or '本刊' in author or '国家' in author\ # if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\ # or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\ # or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author : # or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \ if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \ or '记者' in author or '新闻社' in author \
# or '”' in author\ or '”' in author\
# or '大学' in author or '洛桑江村' in author: or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
continue continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href'] new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href) is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
if is_member: if is_member:
continue continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '') new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
...@@ -165,7 +165,7 @@ if __name__=='__main__': ...@@ -165,7 +165,7 @@ if __name__=='__main__':
} }
log.info(dic_news) log.info(dic_news)
if sendKafka(dic_news): if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href) r.sadd('qiushileaderspeech_two::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}') log.info(f'采集成功----{dic_news["sourceAddress"]}')
...@@ -55,56 +55,56 @@ from obs import ObsClient ...@@ -55,56 +55,56 @@ from obs import ObsClient
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
baseCore = BaseCore() # baseCore = BaseCore()
log = baseCore.getLogger() # log = baseCore.getLogger()
cnx_ = baseCore.cnx # cnx_ = baseCore.cnx
cursor_ = baseCore.cursor # cursor_ = baseCore.cursor
#
def use_ocr(img): # def use_ocr(img):
ocr = ddddocr.DdddOcr() # ocr = ddddocr.DdddOcr()
with open(img, 'rb') as f: # with open(img, 'rb') as f:
image = f.read() # image = f.read()
res = ocr.classification(image) # res = ocr.classification(image)
print(res) # print(res)
return res # return res
#
if __name__=="__main__": # if __name__=="__main__":
requests.DEFAULT_RETRIES = 5 # requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}') # log.info(f'开始时间为:{time_start}')
#
requests.adapters.DEFAULT_RETRIES = 3 # requests.adapters.DEFAULT_RETRIES = 3
headers = { # headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36', # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
} # }
#
opt = webdriver.ChromeOptions() # opt = webdriver.ChromeOptions()
opt.add_argument( # opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36') # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
#
opt.add_argument("--ignore-certificate-errors") # opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors") # opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"]) # opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging']) # opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False) # opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe' # opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe' # chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) # browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "http://zxgk.court.gov.cn/shixin/" # url = "http://zxgk.court.gov.cn/shixin/"
browser.get(url) # browser.get(url)
# 可改动 # # 可改动
time.sleep(20) # time.sleep(20)
page_source = browser.page_source # page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser') # soup = BeautifulSoup(page_source, 'html.parser')
img_url = soup.select('img[id="captchaImg"]')[0]['src'] # img_url = soup.select('img[id="captchaImg"]')[0]['src']
#
browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司') # browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
#
#
browser.find_element(By.ID, 'yzm').send_keys(yzm) # browser.find_element(By.ID, 'yzm').send_keys(yzm)
browser.find_element(By.ID, 'searchForm').click() # browser.find_element(By.ID, 'searchForm').click()
wait = WebDriverWait(browser, 30) # wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) # wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# screen_img_path = "D:/screen/xxx.png" # screen_img_path = "D:/screen/xxx.png"
# out_img_path = "D:/out/xxx.png" # out_img_path = "D:/out/xxx.png"
...@@ -112,3 +112,27 @@ if __name__=="__main__": ...@@ -112,3 +112,27 @@ if __name__=="__main__":
# #
# code = use_ocr(out_img_path) # code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code) # 验证码输入框元素.send_keys(code)
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
\ No newline at end of file
# from baiduSpider import BaiduSpider
# from baiduSpider import BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# try:
# detailurl = url
# title = title
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
# except Exception as e:
# content = ''
# contentWithTag = ''
#
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
res_json = res.text
print(res_json)
\ No newline at end of file
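The body printed by the snippet above is JSONP — the JSON payload wrapped in the jQuery... callback named by the cb= parameter — so it normally has to be unwrapped before json.loads can parse it. A small helper sketch (the regex is an assumption about the wrapper shape, not code from the repo):

import json
import re

def jsonp_to_json(text: str) -> dict:
    # strip "callbackName( ... )" plus an optional trailing semicolon
    match = re.search(r"^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$", text, re.S)
    if match is None:
        raise ValueError("response does not look like JSONP")
    return json.loads(match.group(1))

# wrapper shaped like the eastmoney response
sample = 'jQuery35103326233792363984_1702455623969({"result": {"total": 10}});'
print(jsonp_to_json(sample))   # -> {'result': {'total': 10}}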