Commit 862e97ab Author: 薛凌堃

1/31

Parent 1d1053c8
......@@ -2,32 +2,99 @@
# First collect the Tianyancha company id, then use that id to collect core personnel information
import datetime
import json
import os
import subprocess
import sys
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie':'HWWAFSESID=b6312a4594bea18413c; HWWAFSESTIME=1686818921445; csrfToken=e7sNDKWelJwlcjnm6Rlny887; TYCID=6ff6bc600b5911ee89d35bf79a73a3b1; bannerFlag=true; ssuid=1534238432; refresh_page=0; _ga=GA1.2.1790752229.1688467828; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22307016917%22%2C%22first_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4YmUzZTMzN2U0YmYtMGQ4NTcxNmQzNjZlNDQtMjYwMzFkNTEtMTA0OTA4OC0xODhiZTNlMzM3ZjE5ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNzAxNjkxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22307016917%22%7D%2C%22%24device_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%7D; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=7; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1693986307; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213592481839%22%7D; tyc-user-info-save-time=1693986377592; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5Mzk4NjM3NywiZXhwIjoxNjk2NTc4Mzc3fQ.xeK54nMtB5wt7ipdOjhrzdplT1azvezrTuoD1b8i3OguqMB97ZOR1pFbRsP7vsKRdZ3Fsf5Y5ZqlmRKAVHGraA; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1693986412',
# 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
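# Two database handles from BaseCore: cursor_/cnx_ operate on the crawler DB (clb_proxy, EnterpriseInfo),
# while cursor/cnx query the sys_base_enterprise table.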
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
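# get_proxy: read the proxy pool from the clb_proxy table; each row is a 'host-port' string that is
# turned into a requests-style proxies dict, and ip_num selects which proxy to use.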
def get_proxy(ip_num):
sql = "select proxy from clb_proxy"
cursor_.execute(sql)
proxy_lists = cursor_.fetchall()
cnx_.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[ip_num]
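# get_html: fetch the Tianyancha company page through a proxy and return the executive count shown
# after '最新公示' on the dim tab (0 if the tab is missing); retried up to 3 times on bad responses.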
@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
url = f"https://www.tianyancha.com/company/{tycid}"
ip = get_proxy(ip_num)
response = requests.get(url=url, headers=headers, proxies=ip)
if response.status_code == 200:
pass
else:
ip_num += 1
raise Exception(f'status code {response.status_code}')
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
except:
return 0
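# get_page: call a capi list endpoint and return data.total, so the caller can tell which endpoint
# matches the count shown on the company page.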
@retry(tries=3, delay=1)
def get_page(url, ip_num):
ip = get_proxy(ip_num)
res = requests.get(url=url, headers=headers, proxies=ip)
if res.status_code == 200:
pass
else:
ip_num += 1
raise Exception(f'status code {res.status_code}')
time.sleep(1)
total_page_ = res.json()['data']['total']
return total_page_
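# doJob: pull a social credit code from Redis, resolve the company's Tianyancha id, pick the matching
# executives endpoint, page through it and push the serialized records to the sync interface.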
def doJob():
while True:
# Look up the basic info in the database for the social credit code pulled from Redis
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# If there is no more data in Redis, wait
# social_code = '9135020056842712XB'
social_code = '91320691550279691N'
if social_code == None:
time.sleep(20)
continue
......@@ -35,15 +102,29 @@ def doJob():
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# Put the code back into Redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
count = data[17]
else:
# Put the code back into Redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# Write the enterprise into the local DB
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name)
......@@ -58,28 +139,111 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
for page in range(1,2):
t = int(time.time()*1000)
#https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
# todo: first determine which endpoint to use
try:
charge = get_html(tycid, ip_num)
except Exception as e:
charge = -1
log.info(e)
total_page = 0
t = int(time.time() * 1000)
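# charge == -1: the company page could not be loaded; charge == 0: no '最新公示' tab, so use the dim/staff
# endpoint (flag 2); otherwise compare charge with the totals of noRepeatSeniorExecutive (flag 1) and
# getHkNoRepeatSeniorExecutive (flag 3) to pick the matching endpoint.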
if charge == -1:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
# Get the current process PID
current_pid = baseCore.getPID()
# todo: start a new process and kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, ip_num)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, ip_num)
except:
total_page2 = 0
time.sleep(2)
try:
total_page3 = get_page(url3, ip_num)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
continue
if total_page == 0:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
# todo: get the number of pages
time.sleep(2)
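# int((total_page/20) + 1) + 1 rounds the page count up; when total_page is an exact multiple of 20
# this requests one extra, empty page.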
for page in range(1, int((total_page/20) + 1)+1):
for c in range(3):
ip = baseCore.get_proxy()
# res = requests.get(url,headers=headers,proxies=ip) # ,verify=False
res = requests.get(url,headers=headers) # ,verify=False
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers, proxies=ip) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
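# The listed-company endpoints return the list under data.dataList; the dim/staff endpoint returns data.result.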
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
......@@ -135,15 +299,7 @@ def doJob():
num = num+1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
# res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
res = requests.get(url, headers=headers) # ,verify=False
time.sleep(1)
list_all = res.json()['data']['dataList']
if list_all:
elif flag == 3:
for one_info in list_all:
name = one_info['personal_name']
try:
......@@ -153,8 +309,13 @@ def doJob():
education = ''
position = one_info['position_name']
Salary = ''
try:
birthYear = one_info['year_of_birth']
except:
birthYear = ''
personInfo = one_info['resume_cn']
timestamp = int(int(one_info['employ_date'])/10000)
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
dic_json = {
"socialCreditCode": social_code,
"name": name,
......@@ -166,53 +327,20 @@ def doJob():
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"currentTerm": currentTerm+'至-',
"personInfo": personInfo,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
# res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
res = requests.get(url, headers=headers) # ,verify=False
time.sleep(1)
list_all = res.json()['data']['result']
# todo: handle an additional case
if list_all:
for one_info in list_all:
name = one_info['name']
try:
sex = one_info['sex']
except:
sex = ''
try:
education = one_info['education']
except:
education = ''
try:
position = one_info['typeSore']
except:
position = ''
try:
Salary = one_info['salary']
except:
Salary = ''
birthYear = ''
try:
shareRatio = one_info['percent']
except:
shareRatio = ''
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
try:
currentTerm = one_info['term']
except:
currentTerm = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
......@@ -229,29 +357,29 @@ def doJob():
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"sex": '',
"education": '',
"position": position,
"salary": Salary,
"birthYear": birthYear,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": currentTerm,
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"sex": '',
"education": '',
"position": position,
"salary": Salary,
"birthYear": birthYear,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
......@@ -259,7 +387,7 @@ def doJob():
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
......@@ -272,7 +400,7 @@ def doJob():
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
......
......@@ -7,6 +7,8 @@ import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
......@@ -19,77 +21,207 @@ headers = {
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
requests.adapters.DEFAULT_RETRIES = 5
ip_num = 0
@retry(tries=3, delay=1)
def get_html(tycid):
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
response = requests.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
raise Exception(f'status code {response.status_code}')
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
except:
return 0
@retry(tries=3, delay=1)
def get_page(url):
# ip = baseCore.get_proxy()
res = requests.get(url=url, headers=headers)
time.sleep(1)
total_page_ = res.json()['data']['total']
return total_page_
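# doJob variant that requests Tianyancha directly (the proxy calls are commented out); otherwise the flow
# mirrors the script above: pull a code from Redis, resolve the Tianyancha id, choose the endpoint by the
# '最新公示' count, page through it and POST the result to the sync service.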
def doJob():
while True:
# Look up the basic info in the database for the social credit code pulled from Redis
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# If there is no more data in Redis, wait
social_code = '91510000207312079C'
# social_code = '91320691550279691N'
if social_code == None:
time.sleep(20)
continue
if 'ZZSN' in social_code:
continue
start = time.time()
try:
# data = baseCore.getInfomation(social_code)
# if len(data) != 0:
# pass
# else:
# #数据重新塞入redis
# baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
data = baseCore.getInfomation(social_code)
if len(data) != 0:
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
count = data[17]
else:
# Put the code back into Redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# Write the enterprise into the local DB
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
# id = data[0]
# com_name = data[1]
# xydm = data[2]
tycid = ''
# tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(social_code)
retData = getTycIdByXYDM(com_name)
if retData['state']:
tycid = retData['tycData']['id']
# todo:写入数据库
# updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
# cursor_.execute(updateSql)
# cnx_.commit()
# # todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
except Exception as e:
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
# count = data[17]
log.info(f"---{social_code}----{tycid}----开始采集核心人员")
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
for page in range(1,2):
t = int(time.time()*1000)
#https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url,headers=headers,proxies=ip,verify=False)
# todo: first determine which endpoint to use
try:
charge = get_html(tycid)
except Exception as e:
charge = -1
log.info(e)
time.sleep(2)
t = int(time.time() * 1000)
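# Same endpoint selection as above: charge == -1 means the company page failed, 0 means no '最新公示' tab
# (use dim/staff, flag 2); otherwise the count decides between noRepeatSeniorExecutive (flag 1) and
# getHkNoRepeatSeniorExecutive (flag 3).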
if charge == -1:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2)
except:
total_page2 = 0
time.sleep(1)
try:
total_page3 = get_page(url3)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
continue
if total_page == 0:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
break
# todo: get the number of pages
for page in range(1, int((total_page / 20) + 1) + 1):
for c in range(3):
# ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
#todo:获取当前年份
# todo:获取当前年份
now = datetime.datetime.now()
year = now.year
try:
......@@ -105,47 +237,40 @@ def doJob():
except:
person_img = '--'
dic_json = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"头像":person_img,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num+1
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip, verify=False)
time.sleep(1)
list_all = res.json()['data']['dataList']
if list_all:
elif flag == 3:
for one_info in list_all:
name = one_info['personal_name']
try:
......@@ -155,8 +280,13 @@ def doJob():
education = ''
position = one_info['position_name']
Salary = ''
try:
birthYear = one_info['year_of_birth']
except:
birthYear = ''
personInfo = one_info['resume_cn']
timestamp = int(int(one_info['employ_date']) / 10000)
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
dic_json = {
"socialCreditCode": social_code,
"name": name,
......@@ -168,59 +298,24 @@ def doJob():
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"currentTerm": currentTerm + '至-',
"personInfo": personInfo,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip, verify=False)
time.sleep(1)
try:
list_all = res.json()['data']['result']
except Exception as e:
log.info(res.json())
continue
# todo: handle an additional case
if list_all:
for one_info in list_all:
name = one_info['name']
try:
sex = one_info['sex']
except:
sex = ''
try:
education = one_info['education']
except:
education = ''
try:
position = one_info['typeSore']
except:
position = ''
try:
Salary = one_info['salary']
except:
Salary = ''
birthYear = ''
try:
shareRatio = one_info['percent']
except:
shareRatio = ''
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
try:
currentTerm = one_info['term']
except:
currentTerm = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
person_res = requests.get(person_url, headers=headers, proxies=ip)
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
......@@ -233,29 +328,29 @@ def doJob():
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"sex": '',
"education": '',
"position": position,
"salary": Salary,
"birthYear": birthYear,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": currentTerm,
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"sex": '',
"education": '',
"position": position,
"salary": Salary,
"birthYear": birthYear,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
......@@ -263,25 +358,28 @@ def doJob():
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
log.info(f'---{social_code}---无高管信息---')
continue
else:
pass
response = requests.post('http://114.115.236.206:8088/sync/executive',data=json_updata,timeout=300, verify=False)
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
verify=False)
print(response.text)
log.info('=========成功======')
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# Put back into Redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
# break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
......
......@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
break
except Exception as e:
time.sleep(3)
log.info(e)
continue
if page_size < 1:
......@@ -206,7 +207,8 @@ def download(data, order_by,header):
come = data['come']
except:
come = ''
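# Skip articles published before 2024-01-29 (only newer items are processed)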
if publishDate < '2024-01-29':
return
tf_url = add_check_url(sourceAddress)
if tf_url:
dic_result = {
......@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
# qianyanzhishiku()
# except Exception as e:
# pass
try:
log.info('shijiejingjiluntan')
shijiejingjiluntan()
except Exception as e:
log.info(e)
pass
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu')
# dongfangcaifu()
......@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass
try:
log.info('dongfangcaifu4')
dongfangcaifu4()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu5')
dongfangcaifu5()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu6')
dongfangcaifu6()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu7')
dongfangcaifu7()
except Exception as e:
log.info(e)
pass
import requests
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
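# two_dfsm_mtgc: walk the SASAC '地方扫描' listing pages, skip links already recorded in the Redis set
# 'IN-20240129-0019-test', parse each article's source and publish time, and send the result to the
# 'crawlerInfo' Kafka topic.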
def two_dfsm_mtgc():
info_list = []
"""
地方扫描 (local scan)
"""
url_list = ['http://www.sasac.gov.cn/n2588025/n2588129/index.html',
# 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
]
for url in url_list:
res = requests.get(url=url,headers=headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
pages = soup.find('td', class_='pages')
pages_tag = pages['id'].split('pag_')[1]
pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
# print(pages)
# for page in range(378,int(pages)+1):
for page in range(1,378):
log.info(f'==============开始采集第{page}页===============')
if page == 1:
url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
else:
url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
try:
res = requests.get(url=url, headers=headers)
except:
continue
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('span', id=f'comp_{pages_tag}')
if li_list:
li_list = li_list.find_all('li')
else:
li_list = soup.find_all('li')
for li in li_list:
# print(type(li))
if len(li):
a = li.find('a')
# print(a)
href = a['href']
if 'http' in href:
href = href
else:
href = 'http://www.sasac.gov.cn/' + str(href).replace('../../','')
# print(href)
try:
flag = r.sismember('IN-20240129-0019-test', href)
if flag:
log.info('信息已采集入库过')
continue
# else:
# log.info(f'未采到----{page}-----{href}')
# continue
except Exception as e:
continue
# href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
try:
title = a['title']
except:
title = ''
# print(title)
try:
res_href = requests.get(url=href,headers=headers,verify=False)
except:
continue
res_href.encoding = res_href.apparent_encoding
href_text = res_href.text
i_soup = BeautifulSoup(href_text,'html.parser')
result = i_soup.find(class_='zsy_cotitle')
try:
if result:
result = result.find('p').text
pub_source = result.split('发布时间:')[0].replace('文章来源:','').strip()
pub_time = result.split('发布时间:')[1]
# print(pub_source,pub_time)
try:
i_soup.find('div', id='div_div').decompose()
i_soup.find('div', id='qr_container').decompose()
except:
pass
contentWithTag = str(i_soup.find(class_='zsy_comain'))
content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页','')
else:
result = i_soup.find(class_='lyshijian').find_all('span')
try:
pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
except:
pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
pub_source =''
contentWithTag = str(i_soup.find(class_='pages_content'))
content = str(i_soup.find(class_='articlecontent').text)
if title == '':
log.info(f'title为空----{page}--{title}--{href}')
continue
info_code = 'IN-20240129-0019'
result_dict = {
'id': '',
'sid': '1751849444877144065',
'title': title,
'organ': pub_source,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': pub_time,
'sourceAddress': href,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
except:
continue
if __name__ == "__main__":
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
two_dfsm_mtgc()
\ No newline at end of file
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
# 国资要闻 (state-owned assets news)
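# gzyw: paginate the listing, dedupe against the Redis set 'IN-20240129-0002-test', handle the several
# article page layouts (zsy_comain / articlecontent / pages_content) and push each item to the
# 'crawlerInfo' Kafka topic.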
def gzyw():
info_list = []
url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
# pages = soup.find('td',id='pag_4278129')
pages = soup.find('td', class_='pages')
pages_tag = pages['id'].split('pag_')[1]
pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
# print(pages)
for page in range(1, int(pages)+1):
log.info(f'==============开始采集第{page}页===============')
if page == 1:
url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
else:
#http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
try:
res = requests.get(url=url, headers=headers)
except:
continue
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('span', id=f'comp_{pages_tag}')
if li_list:
li_list = li_list.find_all('li')
else:
li_list = soup.find_all('li')
for li in li_list:
# print(type(li))
if len(li):
a = li.find('a')
# print(a)
href = a['href']
if 'http' in href:
href = href
else:
href = 'http://www.sasac.gov.cn/' + str(href).replace('../../','')
# print(href)
try:
flag = r.sismember('IN-20240129-0002-test', href)
if flag:
# log.info('信息已采集入库过')
continue
# else:
# log.info(f'未采到----{page}-----{href}')
except Exception as e:
continue
try:
title = a['title']
except:
title = ''
# print(title)
try:
res_href = requests.get(url=href,headers=headers,verify=False)
except:
continue
res_href.encoding = res_href.apparent_encoding
href_text = res_href.text
i_soup = BeautifulSoup(href_text,'html.parser')
result = i_soup.find(class_='zsy_cotitle')
try:
if result:
result_ = result.find('p').text
pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
pub_time = result_.split('发布时间:')[1]
# print(pub_source,pub_time)
if title == '':
result.find('p').decompose()
title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
try:
i_soup.find('div', id='div_div').decompose()
i_soup.find('div', id='qr_container').decompose()
except:
pass
contentWithTag = str(i_soup.find(class_='zsy_comain'))
content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页','')
else:
result = i_soup.find(class_='lyshijian')
if result:
result_ = result.find_all('span')
try:
pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
except:
pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
pub_source = ''
if title == '':
result.find('p').decompose()
title = result.text.strip()
contentWithTag = str(i_soup.find(class_='articlecontent'))
content = str(i_soup.find(class_='articlecontent').text)
else:
result = i_soup.find(class_='pages-date')
pub_source = result.find('span').text.replace('来源:', '').strip()
pub_time = result.text
pub_time = pub_time.split('来源')[0].strip()
contentWithTag = str(i_soup.find(class_='pages_content'))
content = str(i_soup.find(class_='pages_content').text)
# content = str(i_soup.find(class_='articlecontent').text)
if title == '':
log.info(f'title为空----{page}--{title}--{href}')
continue
# zhaiyao = HanLP.extractSummary(content,6)
info_code = 'IN-20240129-0002'
result_dict = {
'id':'',
'sid':'1751810519211053058',
'title': title,
'organ': pub_source,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source':16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': pub_time,
'sourceAddress': href,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
except:
continue
if __name__ == "__main__":
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
gzyw()
\ No newline at end of file
"""
中证智能财讯
"""
import json
import requests
from bs4 import BeautifulSoup
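# zzcx: POST to the listES endpoint to get the page count, then walk each page's records and fetch the
# detail page to read the source and publish date (the parsed fields are not persisted yet).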
def zzcx():
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Length': '56',
'Content-Type': 'application/json;charset=UTF-8',
'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Origin': 'https://zzcx.cs.com.cn',
'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
}
payload = json.dumps(payload)
result_json = requests.post(url=url, data=payload, headers=headers).json()
print(result_json)
pages = result_json['data']['pages']
for page in range(1, int(pages + 1)):
payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
payload_page = json.dumps(payload_page)
datas = requests.post(url=url, data=payload_page, headers=headers)
records = datas.json()['data']['records']
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
if __name__ == "__main__":
zzcx()
\ No newline at end of file
......@@ -85,7 +85,8 @@ class ClassTool():
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate']
'发布时间': dic_news['publishDate'],
'标题': dic_news['title']
}
self.db_storage.insert_one(aaa_dic)
......
......@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
# import requests
# headers = {
# # 'Accept': '*/*',
# # 'Accept-Encoding': 'gzip, deflate, br',
# # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# # 'Cache-Control': 'no-cache',
# # 'Connection': 'keep-alive',
# # 'Host': 'search-api-web.eastmoney.com',
# # 'Pragma': 'no-cache',
# # 'Sec-Fetch-Dest': 'script',
# # 'Sec-Fetch-Mode': 'no-cors',
# # 'Sec-Fetch-Site': 'same-site',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# # 'sec-ch-ua-mobile': '?0',
# # 'sec-ch-ua-platform': '"Windows"'
# }
# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
#
#
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
# with open('./a.pdf','wb') as f:
# f.write(res.content)
import datetime
import json
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
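# Signed OSS link (note the Expires/Signature query parameters); fetch it with a browser User-Agent
# and write the PDF to ./a.pdf.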
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
\ No newline at end of file
import json

import pymongo
import requests
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504']
datas = db_storage.find({'postCode':'2'}).limit(5)
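# Take five documents with postCode == '2', bundle their foreign-language title/summary/rich text and
# POST them to the translate service, logging the response.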
for data in datas:
title = data['titleForeign']
contentWithTag = data['richTextForeign']
summary = data['contentForeign']
dic_info = {
'title':title,
'summary':summary,
'contentWithTag':contentWithTag
}
headers = {
'Content-Type': 'application/json',
}
dic_info_ = json.dumps(dic_info)
# print(dic_info_)
# with open('./data.json','w') as f:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text)
\ No newline at end of file