提交 a0a50f7f 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

...@@ -5,7 +5,7 @@ import pymysql ...@@ -5,7 +5,7 @@ import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df_all = pd.read_excel('D:\\企业数据\\数据组提供\\2023专精特新企业名单_20240205.xlsx', dtype=str) df_all = pd.read_excel('D:\\企业数据\\数据组提供\\2023专精特新企业名单_20240205 (2)(1).xlsx', dtype=str)
list_com = [] list_com = []
for num_df in range(len(df_all)): for num_df in range(len(df_all)):
com_name = str(df_all['企业名称'][num_df]) com_name = str(df_all['企业名称'][num_df])
...@@ -27,4 +27,4 @@ for num_df in range(len(df_all)): ...@@ -27,4 +27,4 @@ for num_df in range(len(df_all)):
social_code = '' social_code = ''
df_all['信用代码列'][num_df] = str(social_code) df_all['信用代码列'][num_df] = str(social_code)
df_all.to_excel('D:\\企业数据\\数据组提供\\名单整合.xlsx', index=False) df_all.to_excel('D:\\企业数据\\数据组提供\\名单整合2.xlsx', index=False)
\ No newline at end of file \ No newline at end of file
...@@ -415,6 +415,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -415,6 +415,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
req_ = s.get(headers=headers, url=company_url) req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser') com_soup = BeautifulSoup(req_.content, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
try: try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'}) businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except: except:
...@@ -486,6 +489,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -486,6 +489,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
file.appenddata(file_name, '获取基本信息成功企业', data) file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰 # 将字段转化成英文驼峰
aa_dic = dic_handle(result_dict) aa_dic = dic_handle(result_dict)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName aa_dic['countryName'] = countryName
...@@ -496,8 +500,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -496,8 +500,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType aa_dic['listingType'] = listType
print(aa_dic) print(aa_dic)
sendkafka(aa_dic) # sendkafka(aa_dic)
# print(aa_dic) # print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
else: else:
data_baseinfo = baseinfo(com_soup) data_baseinfo = baseinfo(com_soup)
...@@ -524,6 +531,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -524,6 +531,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
file.appenddata(file_name, '获取基本信息成功企业', data) file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰 # 将字段转化成英文驼峰
aa_dic = dic_handle(data_baseinfo) aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName aa_dic['countryName'] = countryName
...@@ -533,8 +541,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -533,8 +541,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['category'] = category aa_dic['category'] = category
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType aa_dic['listingType'] = listType
sendkafka(aa_dic) # sendkafka(aa_dic)
print(aa_dic) print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
def remove_parentheses(text): def remove_parentheses(text):
# 清除中文小括号 # 清除中文小括号
...@@ -627,17 +638,18 @@ if __name__ == '__main__': ...@@ -627,17 +638,18 @@ if __name__ == '__main__':
'Cache-Control': 'max-age=0', 'Cache-Control': 'max-age=0',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Cookie':'TYCID=6f6298905d3011ee96146793e725899d; ssuid=3467188160; _ga=GA1.2.1049062268.1697190322; HWWAFSESID=2eb035742bde209aa60; HWWAFSESTIME=1706586308439; csrfToken=bT_looAjInHGeAnvjjl12L9v; bannerFlag=true; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=0; tyc-user-phone=%255B%252216603863075%2522%252C%2522152%25203756%25200528%2522%252C%2522159%25200367%25203315%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22310689501%22%2C%22first_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThhZDY5NmEyZWY2ODAtMGFlNWNkOTI5M2ExNTM4LTI2MDMxZjUxLTkyMTYwMC0xOGFkNjk2YTJmMGRjNSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMxMDY4OTUwMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22310689501%22%7D%2C%22%24device_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%7D; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2218703752600%22%2C%22userId%22%3A%22310689501%22%7D; tyc-user-info-save-time=1707008605562; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwNzAwODYwNSwiZXhwIjoxNzA5NjAwNjA1fQ.i8WEUrXjG2X__SnGGlnjwNXyOEdXlslrnvzvKZ_xlVA0rdjdsYHdaieAzkmIjoKbuv6Lc4Eqpb70hWIlq2zeoQ; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1705286979,1706586312; searchSessionId=1707118324.99879267;'
} }
cookies_list, id_cookie = token.get_cookies() # cookies_list, id_cookie = token.get_cookies()
cookies = {} # cookies = {}
for cookie in cookies_list: # for cookie in cookies_list:
cookies[cookie['name']] = cookie['value'] # cookies[cookie['name']] = cookie['value']
s = requests.Session() s = requests.Session()
s.cookies.update(cookies) # s.cookies.update(cookies)
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||' company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论