Commit 862e97ab    Author: 薛凌堃

1/31

Parent 1d1053c8
@@ -2,32 +2,99 @@
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import os
import subprocess
import sys
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie':'HWWAFSESID=b6312a4594bea18413c; HWWAFSESTIME=1686818921445; csrfToken=e7sNDKWelJwlcjnm6Rlny887; TYCID=6ff6bc600b5911ee89d35bf79a73a3b1; bannerFlag=true; ssuid=1534238432; refresh_page=0; _ga=GA1.2.1790752229.1688467828; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22307016917%22%2C%22first_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4YmUzZTMzN2U0YmYtMGQ4NTcxNmQzNjZlNDQtMjYwMzFkNTEtMTA0OTA4OC0xODhiZTNlMzM3ZjE5ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNzAxNjkxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22307016917%22%7D%2C%22%24device_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%7D; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=7; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1693986307; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213592481839%22%7D; tyc-user-info-save-time=1693986377592; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5Mzk4NjM3NywiZXhwIjoxNjk2NTc4Mzc3fQ.xeK54nMtB5wt7ipdOjhrzdplT1azvezrTuoD1b8i3OguqMB97ZOR1pFbRsP7vsKRdZ3Fsf5Y5ZqlmRKAVHGraA; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1693986412', 'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
# 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
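# Note: two MySQL handles are used below — cursor_/cnx_ (baseCore.cursor/cnx) appear to serve the
# crawler's own tables (clb_proxy, EnterpriseInfo), while cursor/cnx (baseCore.cursor_/cnx_) read
# the sys_base_enterprise source table.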
def get_proxy(ip_num):
sql = "select proxy from clb_proxy"
cursor_.execute(sql)
proxy_lists = cursor_.fetchall()
cnx_.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[ip_num]
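# get_proxy assumes each clb_proxy row stores one proxy as a single "host-port" string
# (for example "1.2.3.4-8080", hypothetical) and converts it into a requests-style proxies dict;
# ip_num simply indexes into the resulting list, so callers rotate proxies by passing a different index.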
@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
url = f"https://www.tianyancha.com/company/{tycid}"
ip = get_proxy(ip_num)
response = requests.get(url=url, headers=headers, proxies=ip)
if response.status_code == 200:
pass
else:
ip_num += 1
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
except:
return 0
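# get_html fetches the company page through a proxy and returns the executive count shown next to
# '最新公示' in the dim-tab header, or 0 when that tab is absent. A non-200 response triggers the bare
# raise so that @retry re-attempts up to 3 times; note that the local ip_num += 1 does not change the
# caller's ip_num.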
@retry(tries=3, delay=1)
def get_page(url, ip_num):
ip = get_proxy(ip_num)
res = requests.get(url=url, headers=headers, proxies=ip)
if res.status_code == 200:
pass
else:
ip_num += 1
raise
time.sleep(1)
total_page_ = res.json()['data']['total']
return total_page_
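# get_page returns data.total from a Tianyancha executives list API; @retry gives each URL up to
# 3 attempts, with the proxy list re-read from clb_proxy on every call.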
def doJob():
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
social_code = '91320691550279691N'
if social_code == None:
time.sleep(20)
continue
@@ -35,15 +102,29 @@ def doJob():
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
count = data[17]
else:
#数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name)
@@ -58,28 +139,111 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
#todo:先确定接口走哪个
try:
charge = get_html(tycid, ip_num)
except Exception as e:
charge = -1
log.info(e)
total_page = 0
t = int(time.time() * 1000)
if charge == -1:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
# 获取当前进程pid
current_pid = baseCore.getPID()
# todo: 重新启动新进程,杀死当前进程
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, ip_num)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, ip_num)
except:
total_page2 = 0
time.sleep(2)
try:
total_page3 = get_page(url3, ip_num)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
continue
if total_page == 0:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
#todo:获取页数
time.sleep(2)
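# Endpoint routing above: charge (the count shown next to '最新公示') decides which list is paged through —
#   flag 2: company/dim/staff                (no '最新公示' tab on the page)
#   flag 1: noRepeatSeniorExecutive          (its total matches charge)
#   flag 3: getHkNoRepeatSeniorExecutive     (the HK variant, when its total matches charge)
# Pages of 20 records are then requested; int((total_page/20) + 1) + 1 asks for one extra (empty)
# trailing page whenever total_page is an exact multiple of 20.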
for page in range(1, int((total_page/20) + 1)+1):
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers, proxies=ip) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
@@ -135,131 +299,95 @@ def doJob():
num = num+1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
else: elif flag == 3:
t = int(time.time() * 1000) for one_info in list_all:
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}' name = one_info['personal_name']
ip = baseCore.get_proxy() try:
# res = requests.get(url, headers=headers, proxies=ip) # ,verify=False sex = one_info['gender2']
res = requests.get(url, headers=headers) # ,verify=False except:
time.sleep(1) sex = ''
list_all = res.json()['data']['dataList'] education = ''
if list_all: position = one_info['position_name']
for one_info in list_all: Salary = ''
name = one_info['personal_name'] try:
try: birthYear = one_info['year_of_birth']
sex = one_info['gender2'] except:
except:
sex = ''
education = ''
position = one_info['position_name']
Salary = ''
birthYear = '' birthYear = ''
personInfo = one_info['resume_cn'] personInfo = one_info['resume_cn']
dic_json = { timestamp = int(int(one_info['employ_date'])/10000)
"socialCreditCode": social_code, currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
"name": name, dic_json = {
"sex": sex, "socialCreditCode": social_code,
"education": education, "name": name,
"position": position, "sex": sex,
"salary": Salary, "education": education,
"birthYear": birthYear, "position": position,
"shareNum": '', "salary": Salary,
"shareRatio": '', "birthYear": birthYear,
"benefitShare": '', "shareNum": '',
"currentTerm": '', "shareRatio": '',
"personInfo": personInfo, "benefitShare": '',
"sort": str(num) "currentTerm": currentTerm+'至-',
} "personInfo": personInfo,
num = num + 1 "sort": str(num)
list_one_info.append(dic_json) }
else: num = num + 1
t = int(time.time() * 1000) list_one_info.append(dic_json)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}' else:
ip = baseCore.get_proxy() for one_info in list_all:
# res = requests.get(url, headers=headers, proxies=ip) # ,verify=False name = one_info['name']
res = requests.get(url, headers=headers) # ,verify=False try:
time.sleep(1) position = one_info['typeSore']
list_all = res.json()['data']['result'] except:
# todo:增加一种情况 position = ''
if list_all:
for one_info in list_all:
name = one_info['name']
try:
sex = one_info['sex']
except:
sex = ''
try:
education = one_info['education']
except:
education = ''
try:
position = one_info['typeSore']
except:
position = ''
try:
Salary = one_info['salary']
except:
Salary = ''
birthYear = ''
try:
shareRatio = one_info['percent']
except:
shareRatio = ''
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
try:
currentTerm = one_info['term']
except:
currentTerm = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
@@ -272,7 +400,7 @@ def doJob():
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
@@ -7,6 +7,8 @@ import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
@@ -19,77 +21,207 @@ headers = {
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
@retry(tries=3, delay=1)
def get_html(tycid):
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
response = requests.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
except:
return 0
@retry(tries=3, delay=1)
def get_page(url):
# ip = baseCore.get_proxy()
res = requests.get(url=url, headers=headers)
time.sleep(1)
total_page_ = res.json()['data']['total']
return total_page_
def doJob():
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91320691550279691N'
if social_code == None:
time.sleep(20)
continue
if 'ZZSN' in social_code:
continue
start = time.time() start = time.time()
try: try:
# data = baseCore.getInfomation(social_code) data = baseCore.getInfomation(social_code)
# if len(data) != 0: if len(data) != 0:
# pass id = data[0]
# else: com_name = data[1]
# #数据重新塞入redis xydm = data[2]
# baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code) tycid = data[11]
# continue count = data[17]
else:
# 数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
# id = data[0] # id = data[0]
# com_name = data[1]
# xydm = data[2] # xydm = data[2]
tycid = '' # tycid = data[11]
if tycid == None or tycid == '': if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(social_code) retData = getTycIdByXYDM(com_name)
if retData['state']: if retData['state']:
tycid = retData['tycData']['id'] tycid = retData['tycData']['id']
# todo:写入数据库 # # todo:写入数据库
# updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'" updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
# cursor_.execute(updateSql) cursor_.execute(updateSql)
# cnx_.commit() cnx_.commit()
else: else:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====') log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue continue
except Exception as e: except:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue continue
# count = data[17] count = data[17]
log.info(f"---{social_code}----{tycid}----开始采集核心人员") log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = [] list_one_info = []
num = 1 num = 1
for page in range(1,2):
t = int(time.time()*1000) # todo:先确定接口走哪个
#https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1 try:
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}' charge = get_html(tycid)
ip = baseCore.get_proxy() except Exception as e:
res = requests.get(url,headers=headers,proxies=ip,verify=False) charge = -1
log.info(e)
time.sleep(2)
t = int(time.time() * 1000)
if charge == -1:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2)
except:
total_page2 = 0
time.sleep(1) time.sleep(1)
list_all = res.json()['data']['dataList'] try:
total_page3 = get_page(url3)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
continue
if total_page == 0:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
break
# todo:获取页数
for page in range(1, int((total_page / 20) + 1) + 1):
for c in range(3):
# ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all: if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
if flag == 1:
for one_info in list_all: for one_info in list_all:
name = one_info['name'] name = one_info['name']
sex = one_info['sex'] sex = one_info['sex']
education = one_info['education'] education = one_info['education']
position = one_info['position'] position = one_info['position']
Salary = one_info['salary'] Salary = one_info['salary']
#todo:获取当前年份 # todo:获取当前年份
now = datetime.datetime.now() now = datetime.datetime.now()
year = now.year year = now.year
try: try:
@@ -105,183 +237,149 @@ def doJob():
except: except:
person_img = '--' person_img = '--'
dic_json = { dic_json = {
"socialCreditCode":social_code, "socialCreditCode": social_code,
"name":name, "name": name,
"sex":sex, "sex": sex,
"education":education, "education": education,
"position":position, "position": position,
"salary":Salary, "salary": Salary,
"birthYear":birthYear, "birthYear": birthYear,
"shareNum":StockKeepings, "shareNum": StockKeepings,
"shareRatio":'', "shareRatio": '',
"benefitShare":'', "benefitShare": '',
"currentTerm":currentTerm, "currentTerm": currentTerm,
"personInfo":personInfo, "personInfo": personInfo,
"sort":str(num) "sort": str(num)
} }
dic_json_img = { dic_json_img = {
"socialCreditCode":social_code, "socialCreditCode": social_code,
"name":name, "name": name,
"sex":sex, "sex": sex,
"education":education, "education": education,
"position":position, "position": position,
"salary":Salary, "salary": Salary,
"birthYear":birthYear, "birthYear": birthYear,
"shareNum":StockKeepings, "shareNum": StockKeepings,
"shareRatio":'', "shareRatio": '',
"benefitShare":'', "benefitShare": '',
"currentTerm":currentTerm, "currentTerm": currentTerm,
"personInfo":personInfo, "personInfo": personInfo,
"头像":person_img, "头像": person_img,
"sort":str(num) "sort": str(num)
} }
num = num+1 num = num + 1
list_one_info.append(dic_json) list_one_info.append(dic_json)
# list_all_2.append(dic_json_img) # list_all_2.append(dic_json_img)
else: elif flag == 3:
t = int(time.time() * 1000) for one_info in list_all:
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}' name = one_info['personal_name']
ip = baseCore.get_proxy() try:
res = requests.get(url, headers=headers, proxies=ip, verify=False) sex = one_info['gender2']
time.sleep(1) except:
list_all = res.json()['data']['dataList'] sex = ''
if list_all: education = ''
for one_info in list_all: position = one_info['position_name']
name = one_info['personal_name'] Salary = ''
try: try:
sex = one_info['gender2'] birthYear = one_info['year_of_birth']
except: except:
sex = ''
education = ''
position = one_info['position_name']
Salary = ''
birthYear = '' birthYear = ''
personInfo = one_info['resume_cn'] personInfo = one_info['resume_cn']
dic_json = { timestamp = int(int(one_info['employ_date']) / 10000)
"socialCreditCode": social_code, currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
"name": name, dic_json = {
"sex": sex, "socialCreditCode": social_code,
"education": education, "name": name,
"position": position, "sex": sex,
"salary": Salary, "education": education,
"birthYear": birthYear, "position": position,
"shareNum": '', "salary": Salary,
"shareRatio": '', "birthYear": birthYear,
"benefitShare": '', "shareNum": '',
"currentTerm": '', "shareRatio": '',
"personInfo": personInfo, "benefitShare": '',
"sort": str(num) "currentTerm": currentTerm + '至-',
} "personInfo": personInfo,
num = num + 1 "sort": str(num)
list_one_info.append(dic_json) }
else: num = num + 1
t = int(time.time() * 1000) list_one_info.append(dic_json)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}' else:
ip = baseCore.get_proxy() for one_info in list_all:
res = requests.get(url, headers=headers, proxies=ip, verify=False) name = one_info['name']
time.sleep(1)
try: try:
list_all = res.json()['data']['result'] position = one_info['typeSore']
except Exception as e: except:
log.info(res.json()) position = ''
continue
# todo:增加一种情况
if list_all:
for one_info in list_all:
name = one_info['name']
try:
sex = one_info['sex']
except:
sex = ''
try:
education = one_info['education']
except:
education = ''
try:
position = one_info['typeSore']
except:
position = ''
try:
Salary = one_info['salary']
except:
Salary = ''
birthYear = ''
try:
shareRatio = one_info['percent']
except:
shareRatio = ''
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
try:
currentTerm = one_info['term']
except:
currentTerm = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
person_res = requests.get(person_url, headers=headers, proxies=ip)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
log.info(f'---{social_code}---无高管信息---')
continue
else:
pass
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300, verify=False)
print(response.text)
log.info('=========成功======')
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
# break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
break
except Exception as e:
time.sleep(3)
log.info(e)
continue
if page_size < 1:
@@ -206,7 +207,8 @@ def download(data, order_by,header):
come = data['come']
except:
come = ''
if publishDate < '2024-01-29':
return
tf_url = add_check_url(sourceAddress)
if tf_url:
dic_result = {
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
# qianyanzhishiku()
# except Exception as e:
# pass
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu')
# dongfangcaifu()
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
# except Exception as e:
# log.info(e)
# pass
try:
log.info('dongfangcaifu4')
dongfangcaifu4()
except Exception as e:
log.info(e)
pass

try:
log.info('dongfangcaifu5')
dongfangcaifu5()
except Exception as e:
log.info(e)
pass

try:
log.info('dongfangcaifu6')
dongfangcaifu6()
except Exception as e:
log.info(e)
pass

try:
log.info('dongfangcaifu7')
dongfangcaifu7()
except Exception as e:
log.info(e)
pass
import requests
import json
import sys
import redis
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
def two_dfsm_mtgc():
info_list = []
"""
地方扫描
"""
url_list = ['http://www.sasac.gov.cn/n2588025/n2588129/index.html',
# 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
]
for url in url_list:
res = requests.get(url=url,headers=headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
pages = soup.find('td', class_='pages')
pages_tag = pages['id'].split('pag_')[1]
pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
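# The total page count is scraped from the pager cell itself: the <td class="pages" id="pag_XXXX">
# markup appears to embed an inline script containing maxPageNumXXXX="N";, which is sliced out of
# str(pages) rather than read from a JSON API.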
# print(pages)
# for page in range(378,int(pages)+1):
for page in range(1,378):
log.info(f'==============开始采集第{page}页===============')
if page == 1:
url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
else:
url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
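# The loop counter is mapped onto the site's descending archive numbering: page 1 is index.html,
# page 2 becomes index_<tag>_<maxPageNum-1>.html, ..., and page maxPageNum becomes index_<tag>_1.html.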
try:
res = requests.get(url=url, headers=headers)
except:
continue
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('span', id=f'comp_{pages_tag}')
if li_list:
li_list = li_list.find_all('li')
else:
li_list = soup.find_all('li')
for li in li_list:
# print(type(li))
if len(li):
a = li.find('a')
# print(a)
href = a['href']
if 'http' in href:
href = href
else:
href = 'http://www.sasac.gov.cn/' + str(href).replace('../../','')
# print(href)
try:
flag = r.sismember('IN-20240129-0019-test', href)
if flag:
log.info('信息已采集入库过')
continue
# else:
# log.info(f'未采到----{page}-----{href}')
# continue
except Exception as e:
continue
# href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
try:
title = a['title']
except:
title = ''
# print(title)
try:
res_href = requests.get(url=href,headers=headers,verify=False)
except:
continue
res_href.encoding = res_href.apparent_encoding
href_text = res_href.text
i_soup = BeautifulSoup(href_text,'html.parser')
result = i_soup.find(class_='zsy_cotitle')
try:
if result:
result =result.find('p').text
pub_source = result.split('发布时间:')[0].replace('文章来源:','').strip()
pub_time = result.split('发布时间:')[1]
# print(pub_source,pub_time)
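# The <p> under .zsy_cotitle is expected to look like "文章来源:XXX 发布时间:<date>", so splitting on
# '发布时间:' leaves the source on the left and the publish time on the right.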
try:
i_soup.find('div', id='div_div').decompose()
i_soup.find('div', id='qr_container').decompose()
except:
pass
contentWithTag = str(i_soup.find(class_='zsy_comain'))
content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页','')
else:
result = i_soup.find(class_='lyshijian').find_all('span')
try:
pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
except:
pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
pub_source =''
contentWithTag = str(i_soup.find(class_='pages_content'))
content = str(i_soup.find(class_='articlecontent').text)
if title == '':
log.info(f'title为空----{page}--{title}--{href}')
continue
info_code = 'IN-20240129-0019'
result_dict = {
'id': '',
'sid': '1751849444877144065',
'title': title,
'organ': pub_source,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': pub_time,
'sourceAddress': href,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
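# A KafkaProducer is created and closed for every article here; the href is added to the Redis
# dedup set right after the send, so an article whose send raises is not marked as collected and
# will be picked up again on a later run.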
except:
continue
if __name__ == "__main__":
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
two_dfsm_mtgc()
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
#国资要闻
def gzyw():
info_list = []
url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
# pages = soup.find('td',id='pag_4278129')
pages = soup.find('td', class_='pages')
pages_tag = pages['id'].split('pag_')[1]
pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
# print(pages)
for page in range(1, int(pages)+1):
log.info(f'==============开始采集第{page}页===============')
if page == 1:
url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
else:
#http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
try:
res = requests.get(url=url, headers=headers)
except:
continue
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('span', id=f'comp_{pages_tag}')
if li_list:
li_list = li_list.find_all('li')
else:
li_list = soup.find_all('li')
for li in li_list:
# print(type(li))
if len(li):
a = li.find('a')
# print(a)
href = a['href']
if 'http' in href:
href = href
else:
href = 'http://www.sasac.gov.cn/' + str(href).replace('../../','')
# print(href)
try:
flag = r.sismember('IN-20240129-0002-test', href)
if flag:
# log.info('信息已采集入库过')
continue
# else:
# log.info(f'未采到----{page}-----{href}')
except Exception as e:
continue
try:
title = a['title']
except:
title = ''
# print(title)
try:
res_href = requests.get(url=href,headers=headers,verify=False)
except:
continue
res_href.encoding = res_href.apparent_encoding
href_text = res_href.text
i_soup = BeautifulSoup(href_text,'html.parser')
result = i_soup.find(class_='zsy_cotitle')
try:
if result:
result_ =result.find('p').text
pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
pub_time = result_.split('发布时间:')[1]
# print(pub_source,pub_time)
if title == '':
result.find('p').decompose()
title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
try:
i_soup.find('div', id='div_div').decompose()
i_soup.find('div', id='qr_container').decompose()
except:
pass
contentWithTag = str(i_soup.find(class_='zsy_comain'))
content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页','')
else:
result = i_soup.find(class_='lyshijian')
if result:
result_ = result.find_all('span')
try:
pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
except:
pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
pub_source = ''
if title == '':
result.find('p').decompose()
title = result.text.strip()
contentWithTag = str(i_soup.find(class_='articlecontent'))
content = str(i_soup.find(class_='articlecontent').text)
else:
result = i_soup.find(class_='pages-date')
pub_source = result.find('span').text.replace('来源:', '').strip()
pub_time = result.text
pub_time = pub_time.split('来源')[0].strip()
contentWithTag = str(i_soup.find(class_='pages_content'))
content = str(i_soup.find(class_='pages_content').text)
# content = str(i_soup.find(class_='articlecontent').text)
if title == '':
log.info(f'title为空----{page}--{title}--{href}')
continue
# zhaiyao = HanLP.extractSummary(content,6)
info_code = 'IN-20240129-0002'
result_dict = {
'id':'',
'sid':'1751810519211053058',
'title': title,
'organ': pub_source,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source':16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': pub_time,
'sourceAddress': href,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
except:
continue
if __name__ == "__main__":
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
gzyw()
"""
中证智能财讯
"""
import json
import requests
from bs4 import BeautifulSoup
def zzcx():
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Length': '56',
'Content-Type': 'application/json;charset=UTF-8',
'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Origin': 'https://zzcx.cs.com.cn',
'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
}
payload = json.dumps(payload)
result_json = requests.post(url=url, data=payload, headers=headers).json()
print(result_json)
pages = result_json['data']['pages']
for page in range(1, int(pages + 1)):
payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
payload_page = json.dumps(payload_page)
datas = requests.post(url=url, data=payload_page, headers=headers)
records = datas.json()['data']['records']
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
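# Note: as written zzcx() only walks the paginated listES API and pulls title, origin and
# publishDate from each detail page; nothing is stored or pushed downstream yet, so this looks
# like a work-in-progress stub.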
if __name__ == "__main__":
zzcx()
@@ -85,7 +85,8 @@ class ClassTool():
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate'],
'标题': dic_news['title']
}
self.db_storage.insert_one(aaa_dic)
......
@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
# import requests
# headers = {
# # 'Accept': '*/*',
# # 'Accept-Encoding': 'gzip, deflate, br',
# # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# # 'Cache-Control': 'no-cache',
# # 'Connection': 'keep-alive',
# # 'Host': 'search-api-web.eastmoney.com',
# # 'Pragma': 'no-cache',
# # 'Sec-Fetch-Dest': 'script',
# # 'Sec-Fetch-Mode': 'no-cors',
# # 'Sec-Fetch-Site': 'same-site',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# # 'sec-ch-ua-mobile': '?0',
# # 'sec-ch-ua-platform': '"Windows"'
# }
# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
#
#
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
# with open('./a.pdf','wb') as f:
# f.write(res.content)
import datetime
import json
import requests
import pymongo
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504']
datas = db_storage.find({'postCode':'2'}).limit(5)
for data in datas:
title = data['titleForeign']
contentWithTag = data['richTextForeign']
summary = data['contentForeign']
dic_info = {
'title':title,
'summary':summary,
'contentWithTag':contentWithTag
}
headers = {
'Content-Type': 'application/json',
}
dic_info_ = json.dumps(dic_info)
# print(dic_info_)
# with open('./data.json','w') as f:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text)
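# The translate service at /translate is assumed to accept a JSON body with 'title', 'summary' and
# 'contentWithTag' and to return the translated fields; the response is only logged here.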