提交 4c5b1a70 作者: XveLingKun

0906

上级 255f8c19
...@@ -2,6 +2,13 @@ ...@@ -2,6 +2,13 @@
<project version="4"> <project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false"> <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData> <serverData>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 (2)"> <paths name="root@114.115.141.81:22 (2)">
<serverdata> <serverdata>
<mappings> <mappings>
...@@ -16,6 +23,13 @@ ...@@ -16,6 +23,13 @@
</mappings> </mappings>
</serverdata> </serverdata>
</paths> </paths>
<paths name="root@114.116.49.86:22 (2)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.54.108:22"> <paths name="root@114.116.54.108:22">
<serverdata> <serverdata>
<mappings> <mappings>
......
...@@ -7,16 +7,17 @@ import pymongo ...@@ -7,16 +7,17 @@ import pymongo
from bson import ObjectId from bson import ObjectId
from openpyxl import Workbook, load_workbook from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore import sys
sys.path.append('../../base')
baseCore = BaseCore() import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = baseCore.cnx cnx = baseCore.cnx
cursor = baseCore.cursor cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息0621'] '最大股东信息0902']
class File(): class File():
...@@ -160,10 +161,12 @@ class Info(): ...@@ -160,10 +161,12 @@ class Info():
def update_holder(self, no, dic_info): def update_holder(self, no, dic_info):
db_storage2.update_one({'序号': str(no)}, {'$set': {'最大持股名称': dic_info['最大持股名称'], '持股比例': dic_info['持股比例'], '企业标签': dic_info['企业标签']}}) db_storage2.update_one({'序号': str(no)}, {'$set': {'最大持股名称': dic_info['最大持股名称'], '持股比例': dic_info['持股比例'], '企业标签': dic_info['企业标签']}})
pass pass
def update_info(self, no, dic_info): def update_info(self, no, dic_info):
db_storage2.update_one({'序号': str(no)}, { db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}}) '$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass pass
def insert_into(self, dic_info): def insert_into(self, dic_info):
if dic_info['股东序号序号']: if dic_info['股东序号序号']:
...@@ -179,6 +182,16 @@ class Info(): ...@@ -179,6 +182,16 @@ class Info():
print(result) print(result)
pass pass
def bigshearholder_insert(self,dic_info):
insertion_result = db_storage2.insert_one(dic_info)
inserted_id = insertion_result.inserted_id
return inserted_id
def bigupdate_info(self, no, dic_info):
db_storage2.update_one({'企业信用代码(中国内地企业需填写信用代码)': str(no)}, {
'$set': {'最大持股企业信用代码': dic_info['最大持股企业信用代码'], '最大持股企业标签': dic_info['最大持股企业标签']}})
pass
from selenium import webdriver from selenium import webdriver
class Driver(): class Driver():
......
...@@ -26,7 +26,7 @@ if __name__ == "__main__": ...@@ -26,7 +26,7 @@ if __name__ == "__main__":
name = input('所属用户:') name = input('所属用户:')
driver = create_driver() driver = create_driver()
driver.get(url) driver.get(url)
time.sleep(60) time.sleep(80)
cookies = driver.get_cookies() cookies = driver.get_cookies()
# print(driver.get_cookies()) # print(driver.get_cookies())
......
...@@ -180,32 +180,26 @@ def doJob(): ...@@ -180,32 +180,26 @@ def doJob():
continue continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('shareHolderInfo') item = baseCore.redicPullData('shareHolderInfo')
# item = '900|微创心律管理|None|罗七一|健康科技|¥ 90 亿|¥ 90 亿|¥ 92 亿|823|861|911|ZZSN231108150127681|MicroPort Cardiac Rhythm Management International Limited|中国|None' # item = '91310115MA1HB3LY4M|上海商汤科技开发有限公司'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C' # social_code = '91110108780992804C'
if item == None: if item == None:
time.sleep(30 * 60) time.sleep(30 * 60)
continue continue
start = time.time() start = time.time()
no = item.split('|')[0] # no = item.split('|')[0]
social_code = item.split('|')[11] # social_code = item.split('|')[11]
social_code = item.split('|')[0]
com_name = item.split('|')[1]
recept_name = item.split('|')[12] # recept_name = item.split('|')[12]
dic_info = {"序号": item.split('|')[0], dic_info = {"序号": item.split('|')[0],
"企业名称(榜单公布)": item.split('|')[1], "企业信用代码(中国内地企业需填写信用代码)": social_code,
"企业别称": item.split('|')[2], "企业名称(企查查/天眼查)": com_name
"门人/联合创始": item.split('|')[3],
"行业": item.split('|')[4],
"企业估值(2022年)": item.split('|')[5],
"企业估值(2023年)": item.split('|')[6],
"企业估值(2024年)": item.split('|')[7],
"2022年独角兽排名": item.split('|')[8],
"2023年独角兽排名": item.split('|')[9],
"2024年独角兽排名": item.split('|')[10],
"企业信用代码(中国内地企业需填写信用代码)": item.split('|')[11],
"企业名称(企查查)": item.split('|')[12],
"所属国家": item.split('|')[13]
} }
"""
最大持股企业、最大持股企业原文名称、最大持股企业所属国家、持股比例、最大持股企业信用代码、最大持股企业标签
"""
if "ZZSN" in social_code: if "ZZSN" in social_code:
dic_info['前十大股东名称'] = '' dic_info['前十大股东名称'] = ''
dic_info['持股比例'] = '' dic_info['持股比例'] = ''
...@@ -244,7 +238,7 @@ def doJob(): ...@@ -244,7 +238,7 @@ def doJob():
tycid = '' tycid = ''
if tycid == None or tycid == '': if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(recept_name, s) retData = getTycIdByXYDM(com_name, s)
# retData = getTycIdByXYDM("极星汽车销售有限公司", s) # retData = getTycIdByXYDM("极星汽车销售有限公司", s)
if retData['state']: if retData['state']:
tycid = retData['tycData']['id'] tycid = retData['tycData']['id']
......
"""采集最大股东信息的相关信息"""
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from getTycId import getTycIdByXYDM
import sys
sys.path.append('../../base')
import BaseCore
baseCore = BaseCore.BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/股东信息'
from classtool import Token, Info
token = Token()
Info = Info()
@retry(tries=3, delay=1)
def get_html(tycid, driver, dic_info):
    """Open the tianyancha company page for *tycid* and enrich *dic_info*.

    Fills in the credit code ('最大持股企业信用代码') and the filtered tag list
    ('最大持股企业标签') parsed from the page's __NEXT_DATA__ JSON, then
    returns the same dict. Retried up to 3 times on any parsing failure.
    """
    driver.get(url=f"https://www.tianyancha.com/company/{tycid}")
    time.sleep(3)  # give the SPA time to render
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dic_info['最大持股企业信用代码'] = soup.find(
        'span', attrs={'class': 'index_detail-credit-code__fH1Ny'}).text
    next_data = json.loads(soup.find('script', attrs={'id': '__NEXT_DATA__'}).text)
    raw_tags = next_data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']['tagListV2']
    # Drop boilerplate/status tags and red (risk-colored, '#FF463C') tags.
    skip_titles = ['存续', '曾用名', '竞争风险', '司法案件', '合作风险', '股权出质', '仍注册']
    dic_info['最大持股企业标签'] = [
        tag['title'] for tag in raw_tags
        if tag['title'] not in skip_titles and tag['color'] != '#FF463C'
    ]
    return dic_info
@retry(tries=5, delay=3)
def get_page(url, s, headers):
    """GET *url* through a fresh proxy and return (total, json_payload).

    *total* is data['data']['total'] from the JSON response. Any non-200
    status or unexpected response shape raises, which triggers @retry.
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: a bare `raise` outside an except clause raises
        # RuntimeError("No active exception to re-raise"); raise a
        # descriptive error so retry logs show the real cause.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    data_page = res.json()
    # log.info(f'接口获取总数---{data_page}')
    try:
        total_page_ = data_page['data']['total']
    except (KeyError, TypeError):
        # Response JSON did not have the expected shape; let @retry try again.
        raise
    return total_page_, data_page
@retry(tries=5, delay=3)
def get_page1(url, s, headers):
    """GET *url* through a fresh proxy and return (total, json_payload).

    Like get_page, but reads data['data']['stockHolder']['total'] — the
    shareholder-specific endpoint shape. Failures raise and trigger @retry.
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: bare `raise` outside an except clause would raise
        # RuntimeError instead of a meaningful error.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    data_page = res.json()
    # log.info(f'接口获取总数---{data_page}')
    try:
        total_page_ = data_page['data']['stockHolder']['total']
    except (KeyError, TypeError):
        # Unexpected response shape; let @retry try again.
        raise
    return total_page_, data_page
@retry(tries=5, delay=3)
def post_page(url, s, headers, payload):
    """POST *payload* (JSON-encoded) to *url* and return (total, json_payload).

    *total* is data['data']['total'] from the JSON response. Any non-200
    status or unexpected response shape raises, which triggers @retry.
    """
    ip = baseCore.get_proxy()
    res = s.post(url=url, headers=headers, data=json.dumps(payload), proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: bare `raise` outside an except clause would raise
        # RuntimeError instead of a meaningful error.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    json_info = res.json()
    try:
        total_page_ = json_info['data']['total']
    except (KeyError, TypeError):
        # Unexpected response shape; let @retry try again.
        raise
    return total_page_, json_info
from selenium import webdriver
def create_driver():
    """Start a maximized Microsoft Edge session using the local msedgedriver binary."""
    driver_path = r'D:\soft\msedgedriver.exe'
    capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            # launch the browser maximized
            "args": ["--start-maximized"],
        },
    }
    return webdriver.Edge(executable_path=driver_path, capabilities=capabilities)
def login(driver):
    """Load a stored account cookie set into *driver* and build a matching requests.Session.

    Returns (driver, id_cookie, session). When no account cookies are
    available it sleeps 30 minutes and returns (driver, '', '') so the
    caller can detect the empty id_cookie and retry.
    """
    cookies_list, id_cookie, user_name = token.get_cookies()
    if cookies_list:
        pass
    else:
        log.info("没有账号了,等待30分钟")
        time.sleep(30 * 60)
        # BUG FIX: previously returned ('', '', ''), which made the caller's
        # `driver, id_cookie, s = login(driver)` overwrite its driver with ''
        # and crash on the next iteration. Return the driver unchanged.
        return driver, '', ''
    log.info(f'=====当前使用的是{user_name}的cookie======')
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(3)
    driver.refresh()
    time.sleep(3)
    # Mirror the browser cookies into a requests session for API calls.
    cookies = {cookie['name']: cookie['value'] for cookie in cookies_list}
    s = requests.Session()
    s.cookies.update(cookies)
    return driver, id_cookie, s
def doJob():
    """Main crawl loop: pull items from Redis and collect largest-shareholder info.

    Each Redis item has the form 'no|company_name|tycid' (see the example
    below). For every item it logs in with a rotating cookie, scrapes the
    tianyancha company page via get_html, and persists the result through
    Info.bigupdate_info. Failed items are pushed back into Redis.
    """
    # for social_code in social_code_list:
    driver = create_driver()
    url = 'https://www.tianyancha.com/'
    driver.get(url)
    driver.maximize_window()
    # Bounded loop instead of `while True` so the process eventually exits.
    for i in range(1000):
        # while True:
        # todo: set up cookie usage
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        # Rotate to the next available account cookie; skip this round if none.
        driver, id_cookie, s = login(driver)
        if id_cookie:
            pass
        else:
            continue
        # Fetch the next work item (credit code based) from Redis.
        item = baseCore.redicPullData('BigShareHolder:comname')
        dic_info = {}
        # item = '91310115MA1HB3LY4M|上海阡伦科技有限公司|3476165132'
        # If Redis has no more data, wait 30 minutes before polling again.
        # Big_item = inserted_id + "|" + shareHolderName + "|" + big_tycid
        if item == None:
            time.sleep(30 * 60)
            continue
        start = time.time()
        no = item.split('|')[0]
        # todo: locate this record by its credit code
        tycid = item.split('|')[2]
        com_name = item.split('|')[1]
        try:
            # Missing tianyancha id: resolve it from the company name first.
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name, s)
                    # retData = getTycIdByXYDM("极星汽车销售有限公司", s)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        xydm = retData['tycData']['taxCode']
                    else:
                        # Lookup failed: record the failure and park the item
                        # in the error queue.
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(com_name, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={com_name}====重新放入redis====')
                        baseCore.rePutIntoR('BigShareHolder:Error', item)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(com_name, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('BigShareHolder:Error', item)
                    continue
            log.info(f"---{com_name}----{tycid}----开始采集股东信息")
            try:
                dic_info = get_html(tycid, driver, dic_info)
                charge = 0
            # get_html's three retries all failed
            except:
                charge = -1
            if charge == -1:
                # Mark the cookie as suspect and requeue the item.
                token.updateTokeen(id_cookie, 3)
                baseCore.rePutIntoR('BigShareHolder:comname', item)
                log.info(f"---{com_name}----{tycid}----请求失败----重新放入redis")
                time.sleep(3)
                continue
            else:
                t = int(time.time() * 1000)
                # Persist the scraped credit code and tags for record `no`.
                Info.bigupdate_info(no, dic_info)
        except Exception as e:
            token.updateTokeen(id_cookie, 3)
            # token.updateTokeen(id_cookie, 2)
            log.info(f'==={com_name}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # Push the item back into Redis so it is retried later.
            baseCore.rePutIntoR('BigShareHolder:comname', item)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(com_name, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # df_img = pd.DataFrame(list_all_2)
    # df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
doJob()
\ No newline at end of file
...@@ -74,6 +74,7 @@ if __name__ == "__main__": ...@@ -74,6 +74,7 @@ if __name__ == "__main__":
# loadinfo = [token,cookies] # loadinfo = [token,cookies]
# 保存到数据库中 # 保存到数据库中
# insert = f"insert into weixin_tokenCookies_person (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
insert = f"insert into weixin_tokenCookies (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())" insert = f"insert into weixin_tokenCookies (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
cursor_.execute(insert) cursor_.execute(insert)
cnx_.commit() cnx_.commit()
......
import pandas as pd import pandas as pd
...@@ -2,20 +2,20 @@ import pandas as pd ...@@ -2,20 +2,20 @@ import pandas as pd
import pymongo import pymongo
# 7649 # 7649
data_list = [] data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['新华丝路-丝路商机100+'] db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='shencai', password='shencai_zzsn008').ZZSN['国务院问答对']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}}) # datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# 导出标签是空的数据 # 导出标签是空的数据
datas = db_stroage.find() datas = db_stroage.find()
link = [] link = []
for data in datas: for data in datas:
del data['_id'] del data['_id']
del data['id'] # del data['id']
# if data['标题'] not in link: if data['问题']:
# data_list.append(data) data_list.append(data)
# link.append(data['标题']) else:
data_list.append(data) continue
# print(data) # print(data)
print(len(data_list)) print(len(data_list))
df = pd.DataFrame(data_list) df = pd.DataFrame(data_list)
df.to_excel('./新华丝路-丝路投资2.xlsx',index=False) df.to_excel('./国务院问答对.xlsx',index=False)
\ No newline at end of file \ No newline at end of file
# 读取表中的数据,转化成list # 读取表中的数据,转化成list
...@@ -44,7 +44,8 @@ def getrequest(href, headers): ...@@ -44,7 +44,8 @@ def getrequest(href, headers):
def classify_report_type(title): def classify_report_type(title):
if "年年度报告" in title or re.match(r'\d{4}年度报告', title): type_pattern = r'(.*?)\d{4}年?(年度财务报告|年报|年度报告)'
if "年年度报告" in title or re.match(type_pattern, title):
return "年度报告" return "年度报告"
elif "半年" in title: elif "半年" in title:
return "半年度报告" return "半年度报告"
...@@ -95,15 +96,16 @@ def parase(com_name, social_code, dataJson): ...@@ -95,15 +96,16 @@ def parase(com_name, social_code, dataJson):
"报告年份": year "报告年份": year
} }
db_storage2.insert_one(dic_info) db_storage2.insert_one(dic_info)
time.sleep(1) time.sleep(2)
if __name__ == "__main__": if __name__ == "__main__":
dataList = getcomlist(file_path, sheet_name) dataList = getcomlist(file_path, sheet_name)
# print(dataList) # print(dataList)
for item in enumerate(dataList): for item in enumerate(dataList):
social_code = item[1] # print(item)
com_name = item[2] social_code = item[1][1]
com_name = item[1][2]
print(f"正在采集:{com_name}") print(f"正在采集:{com_name}")
href = url.format(com_name, 1) href = url.format(com_name, 1)
dataJson = getrequest(href, headers) dataJson = getrequest(href, headers)
...@@ -116,5 +118,5 @@ if __name__ == "__main__": ...@@ -116,5 +118,5 @@ if __name__ == "__main__":
href_page = url.format(com_name, page) href_page = url.format(com_name, page)
dataJson_page = getrequest(href_page, headers) dataJson_page = getrequest(href_page, headers)
parase(com_name, social_code, dataJson_page) parase(com_name, social_code, dataJson_page)
time.sleep(2) time.sleep(5)
++ "b/\345\233\275\345\212\241\351\231\242\351\227\256\347\255\224\345\257\271\345\244\204\347\220\206/qa\351\200\211\347\231\273.py"
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论