Commit b52e4502 Author: 薛凌堃

2/26

Parent ca40e9aa
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis
def putCom():
    com_list = ['91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
                '91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
                '91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
                '91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
                '91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
                '91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
                '91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
                '9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
                '91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
                '91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
                '911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
                '9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
                '911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
                '9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
                '91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
                '911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
                '9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
                '91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
                '91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
                '9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
                '91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
                '912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
                '9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
                '91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N']
    df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
    # Connect to the Redis database
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    for i in range(len(df)):
        social_code = df['social_code'][i]
        com_name = df['name'][i]
        # print(social_code)
        # Skip companies already covered by com_list and internal placeholder codes
        if social_code in com_list:
            continue
        if 'ZZSN' in social_code or 'ZD' in social_code:
            continue
        item = social_code + '|' + com_name
        r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)
def putCom_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run at 00:00 on the 1st of every month
    scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
    try:
        # putCom()  # run once before the schedule starts
        scheduler.start()
    except Exception as e:
        print('Scheduled collection exception', e)


if __name__ == '__main__':
    putCom_task()
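For context, something downstream presumably pops these `social_code|com_name` items back off the queue; a minimal consumer sketch under that assumption (`consume_one` is a hypothetical helper, not part of this commit):

def consume_one(r):
    # BLPOP blocks until an item arrives; returns a (key, value) pair or None on timeout
    popped = r.blpop('UpdateBasdeInfo:SocialCode_CompanyName', timeout=5)
    if popped is None:
        return None
    social_code, com_name = popped[1].decode('utf-8').split('|', 1)
    return social_code, com_name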
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
with cnx.cursor() as cursor:
    select = """select relationName, relationId from klb_company"""
    cursor.execute(select)
    results = cursor.fetchall()
    for result in results:
        name = result[0]
        xydm = result[1]
        item = f'{name}|{xydm}'
        # Push "name|credit code" items; the original pushed an undefined cell_value
        r.rpush('SousuoBaidu:companyname', item)
# Name of the list to deduplicate
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch every element in the list
elements = r.lrange(list_name, 0, -1)
# For each distinct element that occurs more than once, trim the extras.
# (The original called lrem with count=0 first, which deletes every occurrence.)
seen = set()
for element in elements:
    if element in seen:
        continue
    seen.add(element)
    count = elements.count(element)
    if count > 1:
        # A positive count removes that many matches from the head,
        # so count - 1 removals leave exactly one copy
        r.lrem(list_name, count - 1, element)
# Print the deduplicated list
print(r.lrange(list_name, 0, -1))
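LREM's count argument is easy to get backwards: positive removes from the head, negative from the tail, zero removes all occurrences. A quick sanity check against a throwaway key (the key name below is made up for illustration):

demo = 'tmp:lrem_demo'
r.delete(demo)
r.rpush(demo, 'a', 'b', 'a', 'c', 'a')
r.lrem(demo, 1, 'a')    # removes the first 'a' from the head -> ['b', 'a', 'c', 'a']
r.lrem(demo, -1, 'a')   # removes one 'a' from the tail       -> ['b', 'a', 'c']
r.lrem(demo, 0, 'c')    # removes every 'c'                   -> ['b', 'a']
print(r.lrange(demo, 0, -1))
r.delete(demo)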
# Meetings of the Central Commission for Comprehensively Deepening Reform
import json
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
@@ -26,22 +32,50 @@ headers = {
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
    'Host': 'news.12371.cn',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
    # Meetings of the Central Commission for Comprehensively Deepening Reform
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    # Meetings of the Central Leading Group for Comprehensively Deepening Reform
    # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
    url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
    request = requests.get(url=url, headers=header)
    soup = BeautifulSoup(request.content, 'html.parser')
    request.encoding = request.apparent_encoding
    # print(soup)
    # info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
    info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
    # The first list block on the page belongs to one channel, the second to the other
    flag = 1
    for info_html in info_html_list:
        if flag == 1:
            info_code = 'IN-20230816-0004'
            sid = '1691633319715676162'
        else:
            sid = '1691633869186277378'
            info_code = 'IN-20230816-0005'
        ul_list = info_html.find('ul', class_='ul_list').find_all('li')
        # Walk the entries oldest-first
        for ul in ul_list[::-1]:
            publishDate_ = str(ul.find('span').text)
            date_obj = datetime.strptime(publishDate_, "%Y年%m月%d日")
            publishDate = date_obj.strftime('%Y-%m-%d')
@@ -51,18 +85,27 @@ if __name__ == "__main__":
            newsUrl = ul.find('a')['href']
            summary = ul.find('a').text
            # TODO: deduplicate links
            try:
                # Use a separate name here; the original reused `flag`,
                # clobbering the outer channel counter
                is_collected = r.sismember(info_code, newsUrl)
                if is_collected:
                    log.info('Already collected and stored')
                    continue
            except Exception as e:
                continue
            news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
            news_soup = BeautifulSoup(news_request.content, 'html.parser')
            # print(news_soup)
            try:
                title = news_soup.find('h1', class_='big_title').text
                source = news_soup.find('div', class_='title_bottom').find('i').text
                contentwithTag = news_soup.find('div', class_='word')
                content = contentwithTag.text
            except Exception as e:
                log.error(f'Failed to parse page {newsUrl}')
                continue
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_info = {
                'id': '1681549361661489154' + str(int(time.time() * 1000)),
                'title': title,
@@ -79,6 +122,7 @@ if __name__ == "__main__":
                'createDate': time_now,
            }
            # Record the URL so the next run skips it
            r.sadd(info_code, newsUrl)
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
            try:
                kafka_result = producer.send("research_center_fourth",
@@ -90,3 +134,4 @@ if __name__ == "__main__":
                print('Kafka send exception!')
            finally:
                producer.close()
        flag += 1
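The arguments to producer.send are elided in the hunk above; a typical completion, offered only as an assumption rather than the committed code, serializes the dict to JSON bytes and blocks on the returned future:

# Hypothetical shape of the elided send call, not the committed code:
kafka_result = producer.send("research_center_fourth",
                             json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
record_metadata = kafka_result.get(timeout=10)  # raises if the broker rejects the record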
@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
browser.get(url) browser.get(url)
# 可改动 # 可改动
time.sleep(20) time.sleep(80)
s = requests.session() s = requests.session()
#获取到token和cookies #获取到token和cookies
...
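The code that actually harvests the token and cookies is elided above; the usual Selenium-to-requests handoff, sketched here purely as an assumption about what follows, copies the logged-in browser cookies into the session and reads the token off the post-login URL:

# Hypothetical sketch, assuming `browser` is a logged-in Selenium webdriver:
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'])
token = browser.current_url.split('token=')[-1]  # assumes a token= query parameter after login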
@@ -170,5 +170,71 @@ for data in datas:
    # f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)
# import re, datetime
#
#
# def paserTime(publishtime):
#     """Parse relative Chinese publish times such as '3天前' into datetimes."""
#     timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
#     current_datetime = datetime.datetime.now()
#     publishtime = publishtime.strip()
#     print(publishtime)
#
#     try:
#         if '年前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=365 * day)
#             publishtime = current_datetime - delta
#         elif '月前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             # timedelta has no 'months' argument; approximate one month as 30 days
#             delta = datetime.timedelta(days=30 * day)
#             publishtime = current_datetime - delta
#         elif '周前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(weeks=day)
#             publishtime = current_datetime - delta
#         elif '天前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=day)
#             publishtime = current_datetime - delta
#         elif '前天' in publishtime:
#             delta = datetime.timedelta(days=2)
#             publishtime = current_datetime - delta
#         elif '昨天' in publishtime:
#             delta = datetime.timedelta(days=1)
#             publishtime = current_datetime - delta
#         elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
#             if '小时' in publishtime:
#                 hour = publishtime.split("小时")[0]
#             else:
#                 hour = 0
#             if hour != 0:
#                 min = publishtime.split("小时")[1].split("分钟")[0]
#             else:
#                 min = publishtime.split("分钟")[0]
#
#             delta = datetime.timedelta(hours=int(hour), minutes=int(min))
#             publishtime = current_datetime - delta
#         elif '年' in publishtime and '月' in publishtime:
#             time_format = '%Y年%m月%d日'
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#         elif '月' in publishtime and '日' in publishtime:
#             current_year = current_datetime.year
#             time_format = '%Y年%m月%d日'
#             publishtime = str(current_year) + '年' + publishtime
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#     except Exception as e:
#         print('Time parsing exception!!')
#     return publishtime
#
# if __name__ == "__main__":
#     publishtime_ = '1小时17分钟前'
#     publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
#     print(publish_time)