提交 71a0e996 作者: 薛凌堃

维护

上级 1ec2de55
......@@ -224,8 +224,8 @@ def BaseInfoEnterprise_task():
#企业核心人员
def CorPerson():
cnx, cursor = connectSql()
# gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
gn_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
# gn_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
cnx.commit()
......@@ -273,6 +273,19 @@ def FinanceFromEase_task():
print('定时采集异常', e)
pass
def JingyingfenxiFromEase():
    """Queue listed-company social credit codes for the Jingyingfenxi crawler.

    Reads codes from the IPO enterprise table and pushes each onto the
    Redis list 'Jingyingfenxi:finance_socialCode'.
    """
    cnx_, cursor_ = cnn11()
    # Read from the listed-enterprise table.
    # BUGFIX: the original WHERE clause was
    #     exchange = '1' or exchange = '2' or exchange = '3' and listed = '1'
    # SQL gives AND higher precedence than OR, so listed='1' only filtered
    # the exchange='3' branch.  IN + AND applies the listed filter to all
    # three exchanges as intended.
    sql_sel = '''select social_credit_code from sys_base_enterprise_ipo where exchange in ('1', '2', '3') and listed = '1' '''
    cursor_.execute(sql_sel)
    finance = cursor_.fetchall()
    cnx_.commit()
    finance_list = [item[0] for item in finance]
    print('=======')
    for item in finance_list:
        # Push each code onto the work queue consumed by the crawler.
        r.rpush('Jingyingfenxi:finance_socialCode', item)
    close11(cnx_, cursor_)
#微信公众号
def WeiXingetFromSql():
cnx_,cursor_=cnn11()
......@@ -645,7 +658,7 @@ if __name__ == "__main__":
# shuangbaiqiye()
# zhuangjingtexind()
# NoticeEnterprise()
NoticeDF()
# NoticeDF()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterprise()
......@@ -670,5 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task()
# FinanceFromEast()
# ipo_code()
JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
......@@ -39,24 +39,23 @@ class EsMethod(object):
body = {
"query": {
"bool": {
"must": [
{
"match": {
"type": "0"
}
},
{
"range": {
"createDate": {
"gte": "2023-12-13T00:00:00",
"lte": "2023-12-15T00:00:00"
}
"bool": {
"must_not": [
{
"exists": {
"field": "content"
}
}
],
"must": [
{
"match": {
"type": "1"
}
}
]
}
}
]
}
},
},
"sort": [
{
"createDate": {
......@@ -74,6 +73,7 @@ class EsMethod(object):
'hits.hits._source.title',
'hits.hits._source.sourceAddress',
'hits.hits._source.createDate',
'hits.hits._source.origin'
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
......@@ -86,9 +86,9 @@ def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
if total == 0:
log.info('++++已没有数据+++++')
return
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
......@@ -100,13 +100,15 @@ def main(page, p, esMethod):
id = mms['_id']
title = mms['_source']['title']
sourceAddress = mms['_source']['sourceAddress']
log.info(f'{id}--{title}--{sourceAddress}---')
if redis_conn.lrem('YanBao:id', 0, id) == 0:
redis_conn.lpush('YanBao:id', id)
origin = mms['_source']['origin']
log.info(f'{id}--{title}--{origin}--{sourceAddress}---')
if origin == 'SEC美国证券交易委员会':
redis_conn.lrem('NianbaoUS:id', 0, id)
redis_conn.lpush('NianbaoUS:id', id)
else:
continue
redis_conn.lrem(f'NianbaoOT_{origin}:id', 0, id)
redis_conn.lpush(f'NianbaoOT_{origin}:id', id)
def run_threads(num_threads,esMethod,j):
threads = []
......@@ -126,7 +128,7 @@ def run_threads(num_threads,esMethod,j):
if __name__ == "__main__":
j = 0
for i in range(2):
for i in range(10):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
......
# -*- coding: utf-8 -*-
import json
import re
import threading
import time
import uuid
from urllib.parse import urljoin
import fitz
import redis
import requests
from bs4 import BeautifulSoup
from obs import ObsClient
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
# Shared module-level resources.
# BUGFIX: BaseCore was instantiated twice in a row; the second instance
# silently replaced the first (whose logger was already handed out) and
# doubled whatever connections BaseCore opens.  One instance is enough.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
# Redis pool backing the 'NianbaoUS:id' work queue (db 6).
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
class EsMethod(object):
    """Thin wrapper around the research-report Elasticsearch index."""

    def __init__(self):
        # Authenticated client with a generous timeout for large documents.
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'researchreportdata'

    def queryatt(self, index_name, id):
        """Look up the document whose `id` field matches and return a trimmed hit."""
        query_body = {
            "query": {
                "match": {
                    "id": id
                }
            }
        }
        # Restrict the response to the handful of fields the caller reads.
        wanted_fields = [
            'hits.hits._id',
            'hits.total.value',
            'hits.hits._source.title',
            'hits.hits._source.socialCreditCode',
            'hits.hits._source.sourceAddress'
            # 'hits.hits._source.createDate',
            # 'hits.hits._source.publishDate',
        ]
        response = self.es.search(index=index_name,
                                  doc_type='_doc',
                                  filter_path=wanted_fields,
                                  body=query_body)
        # log.info(response)
        return response

    def updateaunn(self, index_name, id, content, contentWithTag):
        """Write the extracted plain text and tagged HTML back onto the document."""
        update_body = {
            'doc': {
                'content': content,
                'contentWithTag': contentWithTag
            }
        }
        outcome = self.es.update(index=index_name,
                                 id=id,
                                 body=update_body)
        log.info('更新结果:%s' % outcome)
def paserUrl(html, listurl):
    """Rewrite relative href/src attributes in *html* to absolute URLs using *listurl* as base.

    Mutates the parsed tree in place and returns it.
    """
    # Only anchors and images carry the links we need to absolutize.
    for tag in html.find_all(['a', 'img']):
        attributes = tag.attrs
        if 'href' in attributes:
            tag['href'] = urljoin(listurl, tag['href'])
        elif 'src' in attributes:
            tag['src'] = urljoin(listurl, tag['src'])
    return html
def get_news(news_url, ip_dic):
    """Download a SEC filing page and return it parsed with BeautifulSoup.

    Returns '' (falsy) when the response is not HTTP 200 so callers can
    truth-test the result.  `ip_dic` is a proxy mapping; the proxied request
    is currently commented out but kept for quick switch-over.
    """
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }
    response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
    # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
    if response.status_code == 200:
        # Success: hand back the parsed document.
        result = BeautifulSoup(response.content, 'html.parser')
    else:
        # BUGFIX: the original called log.info('请求失败:', code, text) —
        # print-style positional args with no % placeholders, which makes the
        # logging module raise a formatting error instead of recording the
        # failure.  Build one message string instead.
        log.info(f'请求失败:{response.status_code} {response.text}')
        result = ''
    return result
def main(esMethod):
    """Pop one document id from the 'NianbaoUS:id' queue, fetch its SEC page
    and backfill content / contentWithTag into Elasticsearch.

    Returns early (without raising) when the queue is empty or the page
    cannot be fetched.
    """
    redis_conn = redis.Redis(connection_pool=pool)
    id_ = redis_conn.lpop('NianbaoUS:id')
    # BUGFIX: lpop returns None when the queue is empty; the original decoded
    # first and checked afterwards, so an empty queue raised AttributeError
    # instead of logging '已无数据' and returning.
    if id_ is None:
        log.info('已无数据')
        return
    id = id_.decode()
    # id = "23101317164"
    result_ = esMethod.queryatt(index_name=esMethod.index_name, id=id)
    result = result_['hits']['hits'][0]
    title = result['_source']['title']
    social_code = result['_source']['socialCreditCode']
    # origin = result['_source']['origin']
    log.info(f'====={title}=={social_code}===正在更新===')
    sourceAddress = result['_source']['sourceAddress']
    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    soup = get_news(sourceAddress, ip_dic)
    if not soup:
        # Download failed; leave the document untouched.
        return
    # Convert relative links to absolute before extracting the text.
    soup = paserUrl(soup, sourceAddress)
    content = soup.text.strip()
    esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
def run_threads(num_threads, esMethod):
    """Spawn *num_threads* workers running main(esMethod) and wait for them all."""
    workers = [threading.Thread(target=main, args=(esMethod,)) for _ in range(num_threads)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Entry point: loop forever; each pass spins up 5 worker threads, each of
# which consumes one id from the 'NianbaoUS:id' queue and backfills its
# content in Elasticsearch.  NOTE(review): there is no sleep or exit
# condition, so an empty queue busy-loops — confirm this is intended.
if __name__ == '__main__':
    while True:
        esMethod = EsMethod()
        start = time.time()
        num_threads = 5
        run_threads(num_threads,esMethod)
        log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file
......@@ -8,6 +8,13 @@ import urllib3
from pyquery import PyQuery as pq
import json
from kafka import KafkaProducer
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
#国务院政策问答平台最新发布信息采集
......@@ -16,37 +23,58 @@ def reqHtml(url,data,header):
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
json_data=json.dumps(data)
response = requests.post(url,data=json_data,headers=header,verify=False,timeout=10)
print(response.status_code)
log.info(response.status_code)
html=response.text
except Exception as e:
html=''
return html
def page_list():
# header = {
# 'Host':'xcx.www.gov.cn',
# 'Connection':'keep-alive',
# 'Content-Length':'25',
# 'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
# 'x-tif-did':'pb5XUGL1Zm',
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
# 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
# 'Content-Type':'application/json',
# 'xweb_xhr':'1',
# 'dgd-pre-release':'0',
# 'x-yss-page':'publicService/pages/policyQALibrary/index/index',
# 'x-yss-city-code':'4400',
# 'Accept':'*/*',
# 'Sec-Fetch-Site':'cross-site',
# 'Sec-Fetch-Mode':'cors',
# 'Sec-Fetch-Dest':'empty',
# 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
# 'Accept-Encoding':'gzip, deflate, br',
# 'Accept-Language':'zh-CN,zh;q=0.9'
# }
header = {
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
'Host': 'xcx.www.gov.cn',
'Connection': 'keep-alive',
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code': '4400',
'Accept': '*/*',
'Accept-Language': '*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,445):
print(f'采集第{i}页数据')
for i in range(1,453):
log.info(f'采集第{i}页数据')
k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
data=da.replace('[k]',str(k))
......@@ -56,7 +84,7 @@ def page_list():
hjson=json.loads(lhtml)
data=hjson['data']['list']
except Exception as e:
print(e)
log.info(e)
time.sleep(60)
continue
for ss in data:
......@@ -64,9 +92,9 @@ def page_list():
durl=f'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicy'
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
try:
flag=r.sismember('IN-20230829-0146',sourceAddress)
flag=r.sismember('IN-20230829-0146-test',sourceAddress)
if flag:
print('信息已采集入库过')
log.info('信息已采集入库过')
continue
except Exception as e:
continue
......@@ -77,25 +105,25 @@ def page_list():
def detailpaser(dmsg):
hh={
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
'Host': 'xcx.www.gov.cn',
'Connection': 'keep-alive',
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code': '4400',
'Accept': '*/*',
'Accept-Language': '*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
try:
durl=dmsg['url']
......@@ -107,8 +135,8 @@ def detailpaser(dmsg):
dd=json.loads(dhtml)
sendTokafka(dd)
except Exception as e:
print(e)
print(dhtml)
log.info(e)
# log.info(dhtml)
def sendTokafka(ddata):
dd=ddata['data']
......@@ -117,8 +145,11 @@ def sendTokafka(ddata):
content=dd['content']
contentWithTag=dd['content']
publishTime=dd['publishTime']
time_format='%Y年%m月%d日'
publishDate=str(datetime.datetime.strptime(publishTime, time_format))
if publishTime:
time_format='%Y年%m月%d日'
publishDate=str(datetime.datetime.strptime(publishTime, time_format))
else:
publishDate = '1900-01-01'
origin=dd['departmentName']
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
sid='1696404919115825153'
......@@ -135,13 +166,14 @@ def sendTokafka(ddata):
'source': 'python定制采集',
'type': ''
}
log.info(aa_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code,sourceAddress)
print('发送kafka成功!')
r.sadd(info_code+'-test',sourceAddress)
log.info('发送kafka成功!')
except Exception as e:
print(e)
log.info(e)
finally:
producer.close()
# r.close()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论