提交 96c1e391 作者: 薛凌堃

定制采集维护

上级 9bdb6dc9
......@@ -53,7 +53,7 @@ def paserUrl(html,listurl):
def page_list():
for i in range(1,27):
for i in range(1,29):
print(f"采集到第{i}页!!")
aurl='https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit?webId=8d828e408d90447786ddbe128d495e9e&pageId=1b56e5adc362428299dfc3eb444fe23a&parseType=buildstatic&pageType=column&tagId=右侧内容&tplSetId=209741b2109044b5b7695700b2bec37e&paramJson={"pageNo":[i],"pageSize":"24"}'
url=aurl.replace('[i]',str(i))
......@@ -63,14 +63,16 @@ def page_list():
soup=paserUrl(html,'https://www.miit.gov.cn/zwgk/zcjd/index.html')
html=str(soup.prettify())
doc=pq(html)
ll=doc('li[class="cf"]')
# ll=doc('li[class="cf"]')
ll=doc('li')
for list in ll:
ldoc=pq(list)
title=ldoc('a').text()
# title=ldoc('a').text()
title=ldoc('a').attr('title')
url=ldoc('a').attr('href')
# url='https://www.miit.gov.cn'+url
try:
flag=r.sismember('IN-20230829-0199',url)
flag=r.sismember('IN-20230829-0199-test',url)
if flag:
print(f'信息已采集入库{title}')
continue
......@@ -117,10 +119,15 @@ def sendTokafka(ddata):
content=ddata['content']
contentWithTag=ddata['contentWithTag']
publishTime=ddata['publishTime']
sourceAddress=ddata['sourceAddress']
origin=ddata['origin']
if publishTime:
time_format='%Y-%m-%d'
publishDate=str(datetime.strptime(publishTime, time_format))
else:
publishDate = '1900-01-01'
sourceAddress=ddata['sourceAddress']
origin=ddata['origin']
# time_format='%Y-%m-%d'
# publishDate=str(datetime.strptime(publishTime, time_format))
sid='1696452056436424706'
info_code='IN-20230829-0199'
aa_dict = {
......@@ -138,7 +145,7 @@ def sendTokafka(ddata):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code,sourceAddress)
r.sadd(info_code+'-test',sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论