Commit a97764ed  Author: 薛凌堃

9/14

Parent 495275b6
...
@@ -339,7 +339,7 @@ if __name__ == '__main__':
             continue
         dic_info = baseCore.getInfomation(social_code)
         log.info(f'----当前企业{social_code}--开始处理---')
-        count = dic_info[13]
+        count = dic_info[14]
         com_name = dic_info[1]
         social_code = dic_info[2]
         # Qichacha id
...
@@ -66,8 +66,8 @@ def getTycIdByXYDM(xydm):
 def updateTycInfo():
     while True:
         # Use the social credit code pulled from Redis to fetch the matching basic info from the database
-        # social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
-        social_code = '9111000066990444XF'
+        social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
+        # social_code = '9111000066990444XF'
         # If Redis has run out of data, wait
         if social_code == None:
             time.sleep(20)
@@ -88,7 +88,7 @@ def updateTycInfo():
         try:
             retData = getTycIdByXYDM(xydm)
             if retData['tycData'] and retData['reput']:
-                tycid = retData['tycData']['id']
+                tycid = retData['id']
                 # todo: write to the database
                 updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                 cursor_.execute(updateSql)
...
import json
import redis
from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore = BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                       charset='utf8mb4')
cursor_ = cnx_.cursor()
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql, (200, '91350000158142711F'))
# cnx_.commit()
import time
# from getTycId import getTycIdByXYDM
# social_code = '91440101231247350J'
# data = baseCore.getInfomation(social_code)
# tycid = data[11]
# if tycid == None:
#     print(data)
#     retData = getTycIdByXYDM(social_code)
#     tycid = retData['tycData']['id']
#     print(tycid)
# time_struct = time.localtime(int(1692762780000 / 1000))  # First convert the timestamp to struct_time
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct)  # Then format struct_time as a string
# print(time_format)
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# # source key
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode'
# # target key
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode'
# values = r.lrange(key1, 0, -1)
# for value in values:
#     r.rpush(key2, value)
#
# # Close the Redis connection
# r.close()
list_all = []
if list_all:
    print(len(list_all))
else:
    print('---')
"""
Xueqiu financial data, fetched via the API
"""
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
import datetime
from selenium import webdriver
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()

def getFormatedate(timestamp):
    date = datetime.datetime.fromtimestamp(timestamp)
    formatted_date = date.strftime('%Y-%m-%d')
    return formatted_date
def check_code(com_code):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    res = r.exists('com_xqcwsj_code::' + com_code)
    if res:
        return False
    else:
        return True

def check_date(com_code, info_date):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    res = r.sismember('com_xqcwsj_code::' + com_code, info_date)  # note: stored as a set
    if res:
        return True
    else:
        return False

# Save the collected report periods for each stock code into Redis
def add_date(com_code, report_date):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    res = r.sadd('com_xqcwsj_code::' + com_code, report_date)
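# A minimal usage sketch (not part of the original flow) of the three Redis helpers above:
# check_code() asks whether a stock has ever been collected (key existence),
# check_date() asks whether one report period is already in that stock's set,
# add_date() records a period once it has been collected.
#
# if check_date('NQ873286', '2023-06-30'):      # hypothetical code and period
#     pass                                      # already collected, skip
# else:
#     # ... collect the three statements for this period ...
#     add_date('NQ873286', '2023-06-30')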
def getrequests(url):
    req = requests.get(url=url, headers=headers)
    data = req.json()
    a_infoData = data['data']['list']
    return a_infoData
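# The Xueqiu finance endpoints are assumed (not verified here) to answer with JSON shaped like
#   {"data": {"list": [{"report_date": <ms timestamp>, "total_revenue": [value, yoy], ...}, ...]}}
# which is why getdetail() below reads reportInfodata[<en_name>][0] for the raw value.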
def getdriver(url_name):
    # Open the page with the browser driver (`browser` is created in __main__ below)
    browser.get(url_name)
    time.sleep(3)
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    return soup

def getdetail(reportInfodata, name_map, listinfo, url_name):
    # # report period
    # report_date = reportInfodata['report_date']
    # report_date = getFormatedate(int(report_date / 1000))
    # Request the page via the simulated browser
    soup = getdriver(url_name)
    time.sleep(2)
    # Statement table (the same parser serves the income, balance and cash-flow pages)
    table = soup.find('div', class_='tab-table-responsive')
    list_tr = table.find_all('tr')
    for tr in list_tr[1:]:
        info_name = tr.find('td', colspan='2').text
        # e.g. 营业总收入 (total operating revenue)
        try:
            info_enname = name_map[info_name]
            info_data = reportInfodata[info_enname][0]
            if info_data is None:
                info_data = '--'
        except:
            info_enname = '--'
            info_data = '--'
        dic_info = {
            "name": info_name,
            'enName': info_enname,
            "value": info_data
        }
        listinfo.append(dic_info)
    return listinfo
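# Each entry appended to listinfo above ends up shaped like (field names per the maps below):
#   {"name": "营业总收入", "enName": "total_revenue", "value": 123456789.0}
# with enName and value both forced to '--' when the row has no English mapping or no data.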
def getinfo(com_code):
    dic_info = {}
    for nnn in range(0, 3):
        try:
            ynFirst = check_code(com_code)
            break
        except:
            time.sleep(1)
    # 'https://stock.xueqiu.com/v5/stock/finance/cn/balance.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694508688637'
    url_lrb = f'https://stock.xueqiu.com/v5/stock/finance/cn/income.json?symbol={com_code}&type=all&is_detail=true&count=5&timestamp=1694414063178'
    url_zcfzb = f'https://stock.xueqiu.com/v5/stock/finance/cn/balance.json?symbol={com_code}&type=all&is_detail=true&count=5&timestamp=1694508688637'
    url_xjllb = f'https://stock.xueqiu.com/v5/stock/finance/cn/cash_flow.json?symbol={com_code}&type=all&is_detail=true&count=5&timestamp=1694512695956'
    lrb_name = f'https://xueqiu.com/snowman/S/{com_code}/detail#/GSLRB'
    zcfzb_name = f'https://xueqiu.com/snowman/S/{com_code}/detail#/ZCFZB'
    xjllb_name = f'https://xueqiu.com/snowman/S/{com_code}/detail#/XJLLB'
    a_infoData = getrequests(url_lrb)
    b_infoData = getrequests(url_zcfzb)
    c_infoData = getrequests(url_xjllb)
    # Loop over the report periods
    for i in range(0, 5):
        listLrb = []
        listZcfzb = []
        listXjllb = []
        reportLrbdata = a_infoData[i]
        report_date = a_infoData[i]['report_date']
        # Convert the millisecond timestamp to a date
        report_date = getFormatedate(int(report_date / 1000))
        # Check whether this report period already exists
        for nnn in range(0, 3):
            try:
                panduan = check_date(com_code, report_date)
                if panduan:
                    return dic_info
                else:
                    pass
                break
            except:
                time.sleep(1)
        log.info(f'======正在采集:{report_date}=======')
        # Income statement
        list_Lrb = getdetail(reportLrbdata, lrb_name_map, listLrb, lrb_name)
        print(list_Lrb)
        # Balance sheet
        reportZcfzbdata = b_infoData[i]
        list_Zcfzb = getdetail(reportZcfzbdata, zcfzb_name_map, listZcfzb, zcfzb_name)
        # Cash flow statement
        reportXjllbdata = c_infoData[i]
        list_Xjllb = getdetail(reportXjllbdata, xjllb_name_map, listXjllb, xjllb_name)
        dic_info = {
            "socialCreditCode": social_code,
            "securitiesCode": com_code[2:],
            "date": report_date,
            "debt": list_Zcfzb,
            "profit": list_Lrb,
            "cash": list_Xjllb,
            "ynFirst": ynFirst,
        }
        print(dic_info)
        # End of one report period
        for nnn in range(0, 3):
            try:
                add_date(com_code, report_date)
                break
            except:
                time.sleep(1)
        log.info(f'----{com_code}--{report_date}----结束')
        # if dic_info:
        #     # Store the data via Kaige's interface
        #     data = json.dumps(dic_info)
        #     # print(data)
        #     url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
        #     for nnn in range(0, 3):
        #         try:
        #             res_baocun = requests.post(url_baocun, data=data)
        #             break
        #         except:
        #             time.sleep(1)
        #     print(res_baocun.text)
    return dic_info
if __name__ == '__main__':
    info_date_list = []
    try:
        chromedriver = "D:/chrome/chromedriver.exe"
        browser = webdriver.Chrome(chromedriver)
    except Exception as e:
        print(e)
    social_code = ''
    # Chinese-to-English field name mapping
    lrb_name_map = {
lrb_name_map = {
'营业总收入':'total_revenue',
'其中:营业收入':'revenue',
'营业总成本':'operating_costs',
'其中:营业成本':'operating_cost',
'营业税金及附加':'operating_taxes_and_surcharge',
'销售费用':'sales_fee',
'管理费用':'manage_fee',
'研发费用':'rad_cost',
'财务费用':'financing_expenses',
'其中:利息费用':'finance_cost_interest_fee',
'利息收入':'finance_cost_interest_income',
'资产减值损失':'asset_impairment_loss',
'信用减值损失':'credit_impairment_loss',
'加:公允价值变动收益':'',
'投资收益':'invest_income',
'其中:对联营企业和合营企业的投资收益':'',
'资产处置收益':'asset_disposal_income',
'其他收益':'other_income',
'营业利润':'op',
'加:营业外收入':'non_operating_income',
'其中:非流动资产处置利得':'',
'减:营业外支出':'non_operating_payout',
'其中:非流动资产处置损失':'',
'利润总额':'profit_total_amt',
'减:所得税费用':'income_tax_expenses',
'净利润差额(合计平衡项目)':'',
'净利润':'net_profit',
'(一)持续经营净利润':'continous_operating_np',
'归属于母公司股东的净利润':'net_profit_atsopc',
'少数股东损益':'minority_gal',
'扣除非经常性损益后的净利润':'net_profit_after_nrgal_atsolc',
'基本每股收益':'basic_eps',
'稀释每股收益':'dlt_earnings_per_share',
'其他综合收益':'othr_compre_income',
'归属母公司所有者的其他综合收益':'',
'综合收益总额':'total_compre_income',
'归属于母公司股东的综合收益总额':'net_profit_atsopc',
'归属于少数股东的综合收益总额':'total_compre_income_atms'
    }
    zcfzb_name_map = {
'货币资金':'currency_funds',
'交易性金融资产':'',
'应收票据及应收账款':'ar_and_br',
'其中:应收票据':'bills_receivable',
'应收账款':'account_receivable',
'预付款项':'pre_payment',
'应收利息':'',
'应收股利':'',
'其他应收款':'othr_receivables',
'存货':'inventory',
'合同资产':'',
'划分为持有待售的资产':'',
'一年内到期的非流动资产':'nca_due_within_one_year',
'其他流动资产':'intangible_assets',
'流动资产合计':'total_current_assets',
'可供出售金融资产':'',
'持有至到期投资':'',
'长期应收款':'',
'长期股权投资':'',
'其他权益工具投资':'',
'其他非流动金融资产':'',
'投资性房地产':'',
'固定资产合计':'fixed_asset_sum',
'其中:固定资产':'fixed_asset',
'固定资产清理':'',
'在建工程合计':'construction_in_process_sum',
'其中:在建工程':'construction_in_process',
'工程物资':'',
'生产性生物资产':'',
'油气资产':'',
'无形资产':'intangible_assets',
'开发支出':'dev_expenditure',
'商誉':'',
'长期待摊费用':'lt_deferred_expense',
'递延所得税资产':'dt_assets',
'其他非流动资产':'othr_noncurrent_assets',
'非流动资产合计':'total_noncurrent_assets',
'资产合计':'total_assets',
'短期借款':'st_loan',
'交易性金融负债':'',
'衍生金融负债':'',
'应付票据及应付账款':'accounts_payable',
'应付票据':'',
'应付账款':'',
'预收款项':'',
'合同负债':'contract_liabilities',
'应付职工薪酬':'payroll_payable',
'应交税费':'tax_payable',
'应付利息':'',
'应付股利':'',
'其他应付款':'othr_payables',
'划分为持有待售的负债':'',
'一年内到期的非流动负债':'noncurrent_liab_due_in1y',
'其他流动负债':'othr_current_liab',
'流动负债合计':'total_current_liab',
'长期借款':'lt_loan',
'应付债券':'',
'长期应付款合计':'lt_payable_sum',
'长期应付款':'lt_payable',
'专项应付款':'',
'预计负债':'',
'递延所得税负债':'dt_liab',
'递延收益-非流动负债':'',
'其他非流动负债':'',
'非流动负债合计':'total_noncurrent_liab',
'负债合计':'total_liab',
'实收资本(或股本)':'shares',
'其他权益工具':'',
'永续债':'',
'资本公积':'capital_reserve',
'减:库存股':'',
'其他综合收益':'',
'专项储备':'special_reserve',
'盈余公积':'earned_surplus',
'未分配利润':'undstrbtd_profit',
'一般风险准备':'',
'外币报表折算差额':'',
'归属于母公司股东权益合计':'total_quity_atsopc',
'少数股东权益':'minority_equity',
'股东权益合计':'total_holders_equity',
'负债和股东权益总计':'total_assets'
    }
    xjllb_name_map = {
'销售商品、提供劳务收到的现金':'cash_received_of_sales_service',
'收到的税费返还':'refund_of_tax_and_levies',
'收到其他与经营活动有关的现金':'cash_received_of_othr_oa',
'经营活动现金流入小计':'sub_total_of_ci_from_oa',
'购买商品、接受劳务支付的现金':'goods_buy_and_service_cash_pay',
'支付给职工以及为职工支付的现金':'cash_paid_to_employee_etc',
'支付的各项税费':'payments_of_all_taxes',
'支付其他与经营活动有关的现金':'othrcash_paid_relating_to_oa',
'经营活动现金流出小计':'sub_total_of_cos_from_oa',
'经营活动产生的现金流量净额':'ncf_from_oa',
'收回投资收到的现金':'cash_received_of_dspsl_invest',
'取得投资收益收到的现金':"othrcash_paid_relating_to_fa",
'处置固定资产、无形资产和其他长期资产收回的现金净额':'net_cash_of_disposal_assets',
'处置子公司及其他营业单位收到的现金净额':'',
'收到其他与投资活动有关的现金':'',
'投资活动现金流入小计':'sub_total_of_ci_from_ia',
'购建固定资产、无形资产和其他长期资产支付的现金':'cash_paid_for_assets',
'投资支付的现金':'invest_paid_cash',
'取得子公司及其他营业单位支付的现金净额':'',
'支付其他与投资活动有关的现金':'',
'投资活动现金流出小计':'sub_total_of_cos_from_ia',
'投资活动产生的现金流量净额':'ncf_from_ia',
'筹资活动产生的现金流量':'',
'吸收投资收到的现金':'cash_received_of_absorb_invest',
'其中:子公司吸收少数股东投资收到的现金':'',
'取得借款收到的现金':'cash_received_of_borrowing',
'发行债券收到的现金':'',
'收到其他与筹资活动有关的现金':'cash_received_of_othr_fa',
'筹资活动现金流入小计':'sub_total_of_ci_from_fa',
'偿还债务支付的现金':'cash_pay_for_debt',
'分配股利、利润或偿付利息支付的现金':'cash_paid_of_distribution',
'其中:子公司支付给少数股东的股利':'',
'支付其他与筹资活动有关的现金':'othrcash_paid_relating_to_fa',
'筹资活动现金流出小计':'sub_total_of_cos_from_fa',
'筹资活动产生的现金流量净额':'ncf_from_fa',
'汇率变动对现金及现金等价物的影响':'',
'现金及现金等价物净增加额':'net_increase_in_cce',
'加:期初现金及现金等价物余额':'final_balance_of_cce',
'期末现金及现金等价物余额':'final_balance_of_cce'
    }
    # Note: map entries whose value is '' have no counterpart key in the API response;
    # getdetail() falls into its except branch for those rows and records '--'.
    table_type = ['income', 'balance']
    com_code = 'NQ' + '873286'
    headers = {
'authority': 'stock.xueqiu.com',
'method': 'GET',
'path': '/v5/stock/finance/cn/income.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694414063178',
'scheme': 'https',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'device_id=84ced64554d8060750b1528dc22a3696; s=bt110kz0n8; cookiesu=661693188384462; u=661693188384462; Hm_lvt_1db88642e346389874251b5a1eded6e3=1693188388; xq_a_token=29bdb37dee2432c294425cc9e8f45710a62643a5; xqat=29bdb37dee2432c294425cc9e8f45710a62643a5; xq_r_token=3a35db27fcf5471898becda7aa5dab6afeafe471; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTY5NjgxMTc5NCwiY3RtIjoxNjk0NDEzNTQ2ODU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.Xxu329nQq4bMtwKFJWlScnUUSWrky4T5SWkMum46c2G8la2z4g0d4nyvsO08WP-7moMffId6P3bGWuELULkbv6EHvIZgqge9-fAD4-hmLOjeRh96NsoGfyTAQK7tbnt9LhKz1fDg6SUi8loMqYgM7l-4g-ZM4B6zrZ5hKWdQJFLy0-V8Wzx7HTFYZSX3FNSsbgGqHlW4vykIpsRaNeOOX1M6LYdt6BhbAi1Iv4TflB08LIdu6F1n4dTRbmPq1KCndb2LsLR2HrJZmqmHJB9WMzwlVcIGdz778_CutNrwuWgJbWtb-s3dSESzO0WWw1uIIGZUvRl1D0KSl0P_GQLw9w; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1694414056',
'Origin': 'https://xueqiu.com',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/snowman/S/NQ873286/detail',
'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
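    # NOTE: the Cookie and the timestamp query parameters above are session-bound values
    # captured from one browser session; they presumably expire and need refreshing.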
    dic_info = getinfo(com_code)
"""
Company listing info: only listed companies go into the enterprise library; unlisted companies skip the collection step. Delisted companies are flagged 0.
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# from gpdm import Gpdm
baseCore = BaseCore()
# chromedriver = r"E:\kkwork\zzsn_spider\comData\ipoInfo\chromedriver.exe"
# browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网/新三板'
# gpdm = Gpdm()
# gpdmList = gpdm.doJob()
log = baseCore.getLogger()
error_list = []
list_all_info = []
# Requires the stock code and the enterprise credit code
while True:
    # Read a company from the table
    # com_code1 = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
    com_code = '838616'
    short_name = ''
    social_code = ''
    start = time.time()
    log.info(f'======开始采集{com_code}======')
    # NOTE: the URL is hardcoded to company 833658 and does not use com_code
    url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html'
    headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'qgqp_b_id=28edcf226f056ee077983f40f115eacf; st_si=15067486119520; emshistory=%5B%22%E4%BA%A7%E4%B8%9A%E9%93%BE%22%2C%22sz007sz%22%5D; websitepoptg_show_time=1694403032729; HAList=ty-0-002342-%u5DE8%u529B%u7D22%u5177%2Cty-0-301192-%u6CF0%u7965%u80A1%u4EFD%2Cty-1-688382-%u76CA%u65B9%u751F%u7269-U%2Cty-1-600895-%u5F20%u6C5F%u9AD8%u79D1%2Cty-1-600669-*ST%u978D%u6210%2Cty-116-00691-%u5C71%u6C34%u6C34%u6CE5%2Cty-0-300865-%u5927%u5B8F%u7ACB%2Cty-0-000656-%u91D1%u79D1%u80A1%u4EFD%2Cty-1-600257-%u5927%u6E56%u80A1%u4EFD%2Cty-1-688981-%u4E2D%u82AF%u56FD%u9645; xsb_history=833658%7C%u94C1%u8840%u79D1%u6280%2C838616%7C%u5317%u9CD0%u98DF%u54C1; st_asi=delete; st_pvi=44810095342512; st_sp=2023-07-18%2013%3A55%3A09; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=337; st_psi=20230914142347564-119112305908-4534169252',
'Host': 'xinsanban.eastmoney.com',
'Pragma': 'no-cache',
'Referer': 'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
    req = requests.get(url=url, headers=headers)
    reslut = BeautifulSoup(req.content, 'html.parser')
    # print(reslut)
    li_list = reslut.find('div', id='company_info').find('ul', class_='company-page-left').find_all('li')
    security = reslut.find('div', id='security_info').find('ul', class_='company-page-right').find_all('li')
    listingDate = security[1].find('span', class_='company-page-item-right').text
    businessScope = li_list[7].find('span', class_='company-page-item-right').text
    industry = li_list[8].find('span', class_='company-page-item-right').text
    secutities_type = '新三板'
    category = '3'
    exchange = '1'
    dic_cwsj = {
        "exchange": exchange,
        "category": category,  # stock type (1 = A-share; 2 = B-share; 3 = NEEQ; 4 = H-share)
        'listed': '1',
        "listingDate": listingDate,
        "securitiesCode": com_code,
        "securitiesShortName": short_name,
        "securitiesType": secutities_type,
        "socialCreditCode": social_code,
        "businessScope": businessScope,
        "eastIndustry": industry,
        "csrcIndustry": ''
    }
    print(dic_cwsj)
    break
    # dic_cwsj = {
    #     "exchange": jys_code,
    #     "category": '1',  # stock type (1 = A-share; 2 = B-share; 3 = NEEQ; 4 = H-share)
    #     'listed': '1',
    #     "listingDate": shangshishijian,
    #     "securitiesCode": com_code[2:],
    #     "securitiesShortName": short_name,
    #     "securitiesType": '新三板',
    #     "socialCreditCode": id_code,
    #     "businessScope": zhuyingfanwei,
    #     "eastIndustry": dongcai,
    #     "csrcIndustry": zhengjian
    # }
    #
    # list_all_info.append(dic_cwsj)
    # log.info(f'======{com_code}====采集成功=====')
    #
    # # Save the data to the database through the API, 100 records per batch
    # for num in range(0, len(list_all_info), 100):
    #
    #     json_updata = json.dumps(list_all_info[num:num+100])
    #     # print(json_updata)
    #     try:
    #         response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
    #                                  verify=False)
    #     except Exception as e:
    #         print(e)
    #     print("{}:到:{}".format(num, num + 100))
    #     print(response.text)
"""
"""
证监会公告采集,只能按照搜索企业来采,从上市库里拿企业数据,sys_enterprise_ipo_copy1
craw_state:已采集过表示为True,未采集表示为0,拿取数据表示为ing,解析失败表示为400
update_state:为1 表示需要更新,用来增量循环
如何统计出来该报告采到了没有,dt_error库统计失败的信息
"""
import json
import random
import re
import time
import fitz
import pymysql
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from datetime import datetime
from base import BaseCore
# from fdfs_client.client import get_tracker_conf, Fdfs_client
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)
taskType = '企业公告/证监会'
type_map = {
'zljgcs':'自律监管措施',
'wxh':'问询函',
'jlcf':'纪律处分',
'9506':'公司公告',
'9509':'公司公告',
'9503':'公司公告',
'9504':'公司公告',
'9505':'公司公告',
'9510':'公司公告',
'9520':'公司公告',
'9605':'公司公告',
'9533':'公司公告',
}
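# disclosureType values returned by the NEEQ API are mapped to Chinese category labels here;
# a code missing from type_map would raise a KeyError in SpiderByZJH (no fallback is defined).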
def secrchATT(item_id, name, type_id):
    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
    cursor_.execute(sel_sql, (item_id, name, type_id))
    selects = cursor_.fetchone()
    return selects
# Insert into the att table and return the attachment id
def tableUpdate(retData, com_name, year, pdf_name, num):
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
    selects = secrchATT(item_id, pdf_name, type_id)
    if selects:
        log.info(f'com_name:{com_name}已存在')
        id = selects[0]
        return id
    else:
        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        values = (
            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
            create_time, page_size)
        cursor_.execute(Upsql, values)  # insert
        cnx_.commit()  # commit
        log.info("更新完成:{}".format(Upsql))
        selects = secrchATT(item_id, pdf_name, type_id)
        id = selects[0]
        return id
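# A sketch of the retData dict tableUpdate() consumes — the keys unpacked above are
# assumed to be what baseCore.upLoadToServe() returns (not verified here):
# retData = {'item_id': ..., 'type_id': ..., 'group_name': ..., 'path': ..., 'full_path': ...,
#            'category': ..., 'file_size': ..., 'status': ..., 'create_by': ...,
#            'page_size': ..., 'create_time': ..., 'state': ..., 'content': ...}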
def RequestUrl(url, payload, social_code, start_time):
    # ip = get_proxy()[random.randint(0, 3)]
    pattern = r"\(\[(.*?)\]\)"
    retJsonData = ''   # default, so the function never returns an unbound name
    response = None
    for m in range(0, 3):
        try:
            response = requests.post(url=url, headers=headers, data=payload)  # ,proxies=ip)
            response.encoding = response.apparent_encoding
            break
        except Exception as e:
            log.error(f"request请求异常----{m}-----{e}")
            pass
    # Check the response status code
    if response is not None and response.status_code == 200:
        # Request succeeded; process the response data
        # print(response.text)
        soup = BeautifulSoup(response.text, 'html.parser')
        match = re.search(pattern, str(soup))
        if match:
            retJsonData = match.group(1)
            retJsonData = json.loads(retJsonData)
            # retJsonData = response.json()
            pass
    else:
        # Request failed; log the error
        log.error(f'请求失败:{url}')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
        retJsonData = ''
    return retJsonData
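# The NEEQ endpoint answers in a JSONP-like wrapper; the regex above is assumed to
# unwrap a body shaped roughly like
#   callbackName([{"listInfo": {"totalPages": ..., "content": [...]}}])
# match.group(1) is the object between '([' and '])', which json.loads() then parses.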
def getPages(url, com_code):
    payload = f"startTime=&page=1&companyCd={com_code}&keyword=&disclosureType%5B%5D=9506&disclosureType%5B%5D=9509&disclosureType%5B%5D=9503&disclosureType%5B%5D=9504&disclosureType%5B%5D=9505&disclosureType%5B%5D=9510&disclosureType%5B%5D=9520&disclosureType%5B%5D=9605&disclosureType%5B%5D=9533&wxhType=wxh&zljgcsType=zljgcs&jlcfType=jlcf&newThreeArray%5B%5D=0&newThreeArray%5B%5D=1&newThreeArray%5B%5D=2&siteId=1&sortfield=publishDate&sorttype=desc&keyword1="
    retJsonData = RequestUrl(url, payload, social_code, start_time)
    # The first request just gets the page count
    # print(retJsonData)
    totalPages = retJsonData['listInfo']['totalPages']
    print(totalPages)
    return totalPages
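# Decoded, the form payload above carries (assumed meanings, inferred from the NEEQ query page):
#   companyCd        - stock code to search for
#   page             - page index
#   disclosureType[] - announcement categories (9506/9509/... map to '公司公告' etc. in type_map)
#   wxhType / zljgcsType / jlcfType - inquiry letters / self-regulatory measures / disciplinary actions
#   sortfield=publishDate&sorttype=desc - newest first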
def InsterInto(short_name, social_code, pdf_url):
    insert = False  # was `inster`; initialized once so every return path is bound
    sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='全国中小企业股份转让系统' and type='1' '''
    cursor.execute(sel_sql, (social_code, pdf_url))
    selects = cursor.fetchone()
    if selects:
        print(f'com_name:{short_name}、{pdf_url}已存在')
        return insert
    # Insert the record into the database
    try:
        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
        list_info = [
            social_code,
            pdf_url,
            '全国中小企业股份转让系统',
            '1',
        ]
        # the 144 database
        cursor.execute(insert_sql, tuple(list_info))
        cnx.commit()
        insert = True
        return insert
    except:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return insert
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_name, num):
    # Upload to the file server
    retData = baseCore.upLoadToServe(pdf_url, 8, social_code)
    if not retData['state']:   # the duplicated state check has been collapsed into one
        log.info(f'====pdf解析失败====')
        return False
    # Insert the attachment into the att database
    num = num + 1
    att_id = tableUpdate(retData, com_name, year, pdf_name, num)
    content = retData['content']
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': content,
        'contentWithTag': '',
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '全国中小企业股份转让系统',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # link to the original
        'summary': '',
        'title': pdf_name,
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }
    # print(dic_news)
    # Send the fields to Kafka for storage
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        print(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        print(dic_result)
        return False
# Collect the announcements
def SpiderByZJH(url, dic_info, start_time, num):  # dic_info: basic info fetched from the database
    okCount = 0
    errorCount = 0
    social_code = dic_info[2]
    short_name = dic_info[4]
    com_name = dic_info[1]
    totalPages = getPages(url, com_code)
    for i in range(0, int(totalPages)):
        payload = f"startTime=&page={i}&companyCd={com_code}&keyword=&disclosureType%5B%5D=9506&disclosureType%5B%5D=9509&disclosureType%5B%5D=9503&disclosureType%5B%5D=9504&disclosureType%5B%5D=9505&disclosureType%5B%5D=9510&disclosureType%5B%5D=9520&disclosureType%5B%5D=9605&disclosureType%5B%5D=9533&wxhType=wxh&zljgcsType=zljgcs&jlcfType=jlcf&newThreeArray%5B%5D=0&newThreeArray%5B%5D=1&newThreeArray%5B%5D=2&siteId=1&sortfield=publishDate&sorttype=desc&keyword1="
        retjson = RequestUrl(url, payload, social_code, start_time)
        content_list = retjson['listInfo']['content']
        for rp in content_list:
            pdf_url = 'https://www.neeq.com.cn' + rp['destFilePath']
            name_pdf = rp['disclosureTitle']
            rp_type = type_map[rp['disclosureType']]
            publishDate = rp['publishDate']
            year = publishDate[:4]
            # Insert the record into the database
            insert = InsterInto(short_name, social_code, name_pdf)
            if insert:
                # # announcement list
                # okCount = okCount + 1
                # Parse the PDF: fetch the link, download, then parse and transfer, tracking success and failure
                log.info(f'======={short_name}===========插入公告库成功')
                result = GetContent(pdf_url, name_pdf, social_code, year, publishDate, start_time, com_name, num)
                if result:
                    # announcement list
                    okCount = okCount + 1
                    log.info(f'{short_name}==============解析传输操作成功')
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
                    pass
                else:
                    errorCount += 1
                    # time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    log.error(f'{short_name}=============解析或传输操作失败')
                    continue
            else:
                log.info(f'======={short_name}===========已存在')
                continue
if __name__ == '__main__':
    num = 0
    headers = {
'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '442',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'HOY_TR=FCEXZOPIKBGTYHDL,945C236781ABDFE0,xfslaodpytzTmieq; Hm_lvt_b58fe8237d8d72ce286e1dbd2fc8308c=1694480321; Hm_lpvt_b58fe8237d8d72ce286e1dbd2fc8308c=1694597182',
'Host': 'www.neeq.com.cn',
'Origin': 'https://www.neeq.com.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.neeq.com.cn/products/neeq_listed_companies/related_announcement.html?companyCode=430054',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
    }
    dic_parms = {}
    # Read the stock code, short name and social credit code from the database
    while True:
        start_time = time.time()
        # # Get enterprise info
        # # social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
        social_code = '9110000071092841XX'
        com_code = '430045'
        short_name = '超毅网络'
        dic_info = {}
        # # If Redis has run out of data, wait
        # if social_code == None:
        #     time.sleep(20)
        #     continue
        # dic_info = baseCore.getInfomation(social_code)
        # count = dic_info[16]
        # NOTE: dic_info stays an empty dict on this test path even though SpiderByZJH indexes it;
        # this assumes baseCore.getInfomation() above gets re-enabled for real runs.
        url = 'https://www.neeq.com.cn/disclosureInfoController/productInfoResult.do'
        # Pagination: page 0 ~ totalPages (about 25)
        SpiderByZJH(url, dic_info, start_time, num)
    cursor.close()
    cnx.close()
    # cursor_.close()
    # cnx_.close()
    # Release resources
    baseCore.close()
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can ocur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include HTTP other settiongs
##include http.conf
\ No newline at end of file
...
@@ -19,10 +19,7 @@ from fdfs_client.client import get_tracker_conf, Fdfs_client
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
-# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
-cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
-# cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
-# cursor = cnx.cursor()
+cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
 cursor_ = cnx_.cursor()
 cnx = baseCore.cnx
@@ -32,7 +29,7 @@ cursor = baseCore.cursor
 tracker_conf = get_tracker_conf('./client.conf')
 client = Fdfs_client(tracker_conf)
-taskType = '企业公告/证监会'
+taskType = '企业公告/证监会/福布斯'
 def RequestUrl(url, payload, social_code, start_time):
     # ip = get_proxy()[random.randint(0, 3)]
@@ -142,30 +139,25 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
     inster = False
     sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
-    cursor_.execute(sel_sql, (social_code, pdf_url))
-    selects = cursor_.fetchone()
+    cursor.execute(sel_sql, (social_code, pdf_url))
+    selects = cursor.fetchone()
     if selects:
         print(f'com_name:{short_name}、{pdf_url}已存在')
         return inster
-    # Insert the record into the database
+    # todo: insert the record into the database — switch databases
     try:
-        insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,author,type) values(%s,%s,%s,%s,%s)'''
         list_info = [
             social_code,
-            name_pdf,
-            '',  # summary
-            '',  # body text
-            pub_time,  # publish date
             pdf_url,
             '证监会',
             report_type,
-            '1',
-            'zh'
+            '1'
         ]
-        cursor_.execute(insert_sql, tuple(list_info))
-        cnx_.commit()
+        cursor.execute(insert_sql, tuple(list_info))
+        cnx.commit()
         insert = True
         return insert
     except:
@@ -201,6 +193,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
         baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
         return False
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    # todo: the attachment id needs to be written to the att database
     dic_news = {
         'attachmentIds': id,
         'author': '',
@@ -372,33 +365,29 @@ if __name__ == '__main__':
     while True:
         start_time = time.time()
         # Get enterprise info
-        # social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
+        social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
         # If Redis has run out of data, wait
-        # if social_code == None:
-        #     time.sleep(20)
-        #     continue
+        if social_code == None:
+            time.sleep(20)
+            continue
         # Get enterprise info
-        # query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is Null limit 1 "
+        # query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 ='1' limit 1 "
         # 兴业银行 (Industrial Bank)
-        query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
-        cursor.execute(query)
-        row = cursor.fetchone()
-        if row:
-            pass
-        else:
-            print('没有数据了,结束脚本')
-            break
+        # query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
+        # cursor.execute(query)
+        # row = cursor.fetchone()
+        # if row:
+        #     pass
+        # else:
+        #     print('没有数据了,结束脚本')
+        #     break
         # tycid = row[14]
-        com_name = row[6]
-        social_code = row[4]
-        code = row[7]
-        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        # 1 marks the record as taken
-        updateBeginSql = f"update Tfbs_bak set state2='1',date1='{time_now}' where col3='{social_code}' "
-        cursor.execute(updateBeginSql)
-        cnx.commit()
         dic_info = baseCore.getInfomation(social_code)
+        com_name = dic_info[1]
+        social_code = dic_info[2]
+        code = dic_info[3]
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         count = dic_info[16]
         # Shanghai http://eid.csrc.gov.cn/101111/index.html  Shenzhen http://eid.csrc.gov.cn/101811/index.html  BSE http://eid.csrc.gov.cn/102611/index.html
        # url varies with pagination and column, e.g. http://eid.csrc.gov.cn/101811/index_3_f.html
@@ -412,7 +401,6 @@ if __name__ == '__main__':
         # Pick the link from the stock code
         # Codes starting with 0/2/3 are Shenzhen, 6/9 Shanghai, 4/8 Beijing
-        code = dic_info[3]
         short_name = dic_info[4]
         dic_parms = getUrl(code, url_parms, Catagory2_parms)
         dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
...
 # _*_ coding:utf-8 _*_
 """Run over the full dataset once, with no dedup logic"""
+import datetime
 import json
 import re
 import time
@@ -8,7 +9,6 @@ import time
 import fitz
 import pymongo
 import requests
-from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from pyquery import PyQuery as pq
 from requests.packages import urllib3
@@ -23,7 +23,8 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from lxml import etree
 from random import choice
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
 log = baseCore.getLogger()
 taskType = '政策法规'
@@ -37,14 +38,13 @@ taskType = '政策法规'
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
     '国务院_国资委_copy1']
-driver_path = r'D:\cmd100\chromedriver.exe'
-chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
+driver_path = r'D:\fbs_spider\cmd100\chromedriver.exe'
+chromr_bin = r'D:\fbs_spider\Google\Chrome\Application\chrome.exe'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
 }
-
 # Convert relative URLs in the html into absolute URLs
 def paserUrl(html, listurl):
     # Grab all <a> and <img> tags
@@ -60,7 +60,6 @@ def paserUrl(html, listurl):
             link['src'] = urljoin(listurl, link['src'])
     return html
-
 def getDriver():
     service = Service(driver_path)
     chrome_options = webdriver.ChromeOptions()
@@ -85,7 +84,6 @@ def getDriver():
     # })
     return bro
-
 def save_data(dic_news):
     aaa_dic = {
@@ -97,7 +95,6 @@ def save_data(dic_news):
     }
     db_storage.insert_one(aaa_dic)
-
 def sendKafka(dic_news):
     start_time = time.time()
     try:  # 114.116.116.241
@@ -132,12 +129,10 @@ def sendKafka(dic_news):
         takeTime = baseCore.getTimeCost(start_time, time.time())
         return False
-
 def redefid(idList):
     id_ = ','.join(map(str, idList))
     return id_
-
 def remove_dup():
     pass
@@ -310,7 +305,6 @@ def get_content1():
     end_time = time.time()
     log.info(f'共抓取国务院文件{num}条数据,共耗时{start_time - end_time}')
-
 # State Council departmental documents
 def get_content2():
     def getTotalpage(bmfl,headers,session):
@@ -460,7 +454,6 @@ def get_content2():
     end_time = time.time()
     log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
-
 # SASAC of the State Council - policy releases
 def get_content3():
     def getPage():
@@ -612,12 +605,8 @@ def get_content3():
     log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
     partOne()
-    # Needs commenting out for incremental runs
-    partTwo()
-
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
+    # Needs commenting out for incremental runs: partTwo()
+    # partTwo()
 # Beijing
 def bei_jing():
@@ -771,7 +760,6 @@ def bei_jing():
         log.info(e)
         pass
-
 # Inner Mongolia
 def nei_meng_gu():
     start = time.time()
@@ -891,7 +879,6 @@ def nei_meng_gu():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Jilin
 def ji_lin():
     start = time.time()
@@ -1082,9 +1069,8 @@ def ji_lin():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Shanghai
-# todo: the list page is not fetched completely — done
 def shang_hai():
     start = time.time()
     num = 0
@@ -1216,7 +1202,6 @@ def shang_hai():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Zhejiang
 def zhe_jiang():
     start = time.time()
@@ -1337,7 +1322,6 @@ def zhe_jiang():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Fujian
 def fu_jian():
     error_tag = str(404)
@@ -1385,12 +1369,11 @@ def fu_jian():
                 i_html = href_text.text
                 i_soup = BeautifulSoup(i_html, 'html.parser')
                 real_href = href
-                real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
+                # real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
                 # print(real_href)
-                # is_href = db_storage.find_one({'网址': real_href})
-                # if is_href:
-                #     continue
+                is_href = db_storage.find_one({'网址': real_href})
+                if is_href:
+                    continue
                 try:
                     # The article is a remote pdf
                     # Download the file straight to the server and parse out the body text
@@ -1496,9 +1479,7 @@ def fu_jian():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Shandong
-# todo: effective date, issuing authority — done
 def shan_dong():
     headers = {
         'Cookie': 'COLLCK=2502513302; COLLCK=2493627587',
@@ -1609,7 +1590,6 @@ def shan_dong():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Guangdong
 def guang_dong():
     start = time.time()
@@ -1710,7 +1690,6 @@ def guang_dong():
     end = time.time()
     print('共', num, '条', '...........', '共耗时', end - start, '秒')
-
 # Hainan
 def hai_nan():
     def hai_nan1():
@@ -2348,7 +2327,6 @@ def hai_nan():
     hai_nan1()
     hai_nan2()
-
 # Sichuan
 def si_chuan():
     num = 0
@@ -2442,7 +2420,6 @@ def si_chuan():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Guangxi
 def guang_xi():
     num = 0
@@ -2567,7 +2544,6 @@ def guang_xi():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Guizhou
 def gui_zhou():
     """
@@ -2677,7 +2653,6 @@ def gui_zhou():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Yunnan
 def yun_nan():
     def yun_nan1():
@@ -2908,7 +2883,6 @@ def yun_nan():
     yun_nan1()
     yun_nan2()
-
 # Chongqing
 def chong_qing():
     """
@@ -3035,7 +3009,6 @@ def chong_qing():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Tianjin
 def tian_jin():
     def tian_jin1():
@@ -3413,7 +3386,6 @@ def tian_jin():
     tian_jin2()
     tian_jin3()
-
 # Xinjiang
 def xin_jiang():
     def xin_jiang1():
@@ -3620,7 +3592,6 @@ def xin_jiang():
     xin_jiang1()
     xin_jiang_jsbt()
-
 # Shanxi
 def shan_xi():
     num = 0
@@ -3734,7 +3705,6 @@ def shan_xi():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Liaoning
 def liao_ning():
     num = 0
@@ -3844,7 +3814,6 @@ def liao_ning():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Heilongjiang
 def hei_long_jiang():
     num = 0
@@ -3940,7 +3909,6 @@ def hei_long_jiang():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Jiangsu
 def jiang_su():
     num = 0
@@ -4052,7 +4020,6 @@ def jiang_su():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Anhui
 def an_hui():
     def an_hui1():
@@ -4246,9 +4213,9 @@ def an_hui():
         print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
     an_hui1()
-    log.info('------test----')
     an_hui2()
-
 # Jiangxi
 def jiang_xi():
     """
@@ -4377,7 +4344,6 @@ def jiang_xi():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Henan
 def he_nan():
     num = 0
@@ -4470,7 +4436,6 @@ def he_nan():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Hunan
 def hu_nan():
     num = 0
@@ -4572,7 +4537,6 @@ def hu_nan():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Gansu
 def gan_su():
     def gan_su1():
@@ -4992,7 +4956,6 @@ def gan_su():
     gan_su2()
     gan_su3()
-
 # Ningxia
 def ning_xia():
     num = 0
@@ -5087,7 +5050,6 @@ def ning_xia():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Shaanxi
 def shanxi():
     num = 0
@@ -5188,7 +5150,6 @@ def shanxi():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Tibet
 def xi_zang():
     start_time = time.time()
@@ -5280,7 +5241,6 @@ def xi_zang():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Qinghai
 def qing_hai():
     def qing_hai1():
@@ -5500,7 +5460,6 @@ def qing_hai():
     qing_hai1()
     qing_hai2()
-
 # Hebei
 def he_bei():
     num = 0
@@ -5591,7 +5550,6 @@ def he_bei():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-
 # Hubei
 def hu_bei():
     num = 0
@@ -5701,39 +5659,42 @@ def hu_bei():
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
 if __name__ == '__main__':
     get_content1()
     get_content2()
     get_content3()
-    # bei_jing()
-    # nei_meng_gu()
-    # ji_lin()
-    # shang_hai()
-    # zhe_jiang()
-    # fu_jian()
-    # shan_dong()
-    # guang_dong()
-    # hai_nan()
-    # si_chuan()
-    # guang_xi()
-    # gui_zhou()
-    # yun_nan()
-    # chong_qing()
-    # tian_jin()
-    # xin_jiang()
-    # shan_xi()
-    # liao_ning()
-    # hei_long_jiang()
-    # jiang_su()
-    # an_hui()
-    # jiang_xi()
-    # he_nan()
-    # hu_nan()
-    # gan_su()
-    # ning_xia()
-    # xi_zang()
-    # shanxi()
-    # qing_hai()
-    # he_bei()
-    # qing_hai()
+    bei_jing()
+    nei_meng_gu()
+    ji_lin()
+    shang_hai()
+    zhe_jiang()
+    fu_jian()
+    shan_dong()
+    guang_dong()
+    hai_nan()
+    si_chuan()
+    guang_xi()
+    gui_zhou()
+    yun_nan()
+    chong_qing()
+    tian_jin()
+    xin_jiang()
+    shan_xi()
+    liao_ning()
+    hei_long_jiang()
+    jiang_su()
+    an_hui()
+    jiang_xi()
+    he_nan()
+    hu_nan()
+    gan_su()
+    ning_xia()
+    xi_zang()
+    shanxi()
+    qing_hai()
+    he_bei()
+    qing_hai()
+    current_time = datetime.datetime.now()
+    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    sleep_seconds = (midnight_time - current_time).total_seconds()
+    time.sleep(sleep_seconds)
import policy
import tingtype
import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler

basecore = BaseCore.BaseCore()
log = basecore.getLogger()

def policylaw_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run once a day at midnight
    # NOTE: add_job expects a callable; `policy` and `tingtype` are imported modules here,
    # so this assumes they expose a callable entry point (left as in the source).
    scheduler.add_job(policy, 'cron', hour=0, minute=0)
    scheduler.add_job(tingtype, 'cron', hour=0, minute=0)
    try:
        scheduler.start()
    except Exception as e:
        log.info(f'定时采集异常 {e}')
        pass

policylaw_task()
\ No newline at end of file
...
+import datetime
 import json
 import random
 import time
@@ -277,6 +278,10 @@ if __name__=='__main__':
             job()
         except Exception as e:
             print(e)
+        current_time = datetime.datetime.now()
+        midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+        sleep_seconds = (midnight_time - current_time).total_seconds()
+        time.sleep(sleep_seconds)
 # Create an ExcelWriter object
 # writer = pd.ExcelWriter('国务院厅局.xlsx')
...
import json
from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore = BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
                       charset='utf8mb4')
cursor_ = cnx_.cursor()

updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# print(updateBeginSql)
cursor_.execute(updateBeginSql, (200, '91350000158142711F'))
cnx_.commit()