定制采集维护

96c1e391 · 薛凌堃 · 9bdb6dc9 · 96c1e391
--- a/comData/dingzhi/miit.py
+++ b/comData/dingzhi/miit.py
@@ -53,7 +53,7 @@ def paserUrl(html,listurl):


 def page_list():
-    for i in range(1,27):
+    for i in range(1,29):
        print(f"采集到第{i}页！！")
        aurl='https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit?webId=8d828e408d90447786ddbe128d495e9e&pageId=1b56e5adc362428299dfc3eb444fe23a&parseType=buildstatic&pageType=column&tagId=右侧内容&tplSetId=209741b2109044b5b7695700b2bec37e&paramJson={"pageNo":[i],"pageSize":"24"}'
        url=aurl.replace('[i]',str(i))
@@ -63,14 +63,16 @@ def page_list():
        soup=paserUrl(html,'https://www.miit.gov.cn/zwgk/zcjd/index.html')
        html=str(soup.prettify())
        doc=pq(html)
-        ll=doc('li[class="cf"]')
+        # ll=doc('li[class="cf"]')
+        ll=doc('li')
        for list in ll:
            ldoc=pq(list)
-            title=ldoc('a').text()
+            # title=ldoc('a').text()
+            title=ldoc('a').attr('title')
            url=ldoc('a').attr('href')
            # url='https://www.miit.gov.cn'+url
            try:
-                flag=r.sismember('IN-20230829-0199',url)
+                flag=r.sismember('IN-20230829-0199-test',url)
                if flag:
                    print(f'信息已采集入库{title}')
                    continue
@@ -117,10 +119,15 @@ def sendTokafka(ddata):
    content=ddata['content']
    contentWithTag=ddata['contentWithTag']
    publishTime=ddata['publishTime']
-    sourceAddress=ddata['sourceAddress']
-    origin=ddata['origin']
+    if publishTime:
        time_format='%Y-%m-%d'
        publishDate=str(datetime.strptime(publishTime, time_format))
+    else:
+        publishDate = '1900-01-01'
+    sourceAddress=ddata['sourceAddress']
+    origin=ddata['origin']
+    # time_format='%Y-%m-%d'
+    # publishDate=str(datetime.strptime(publishTime, time_format))
    sid='1696452056436424706'
    info_code='IN-20230829-0199'
    aa_dict = {
@@ -138,7 +145,7 @@ def sendTokafka(ddata):
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
-        r.sadd(info_code,sourceAddress)
+        r.sadd(info_code+'-test',sourceAddress)
        print('发送kafka结束')
    except Exception as e:
        print(e)