Commit e8838732  Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
......@@ -57,7 +57,7 @@ def doJob():
origin = dataJson['source']
if db_stroage.find_one({'原文链接': href}):
continue
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
continue
log.info(f'开始采集==={title}')
getDic(href, title, publishDate, origin, type[0])
......
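The hunk above simply narrows the collection window from 2023 to 2022. Because the timestamps are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, plain string comparison already orders them chronologically; a minimal sketch of the same check, with a hypothetical helper name:

def in_window(publishDate, start='2022-01-01 00:00:00', end='2022-12-31 23:59:59'):
    # zero-padded timestamp strings sort lexicographically in date order
    return start <= publishDate <= end

# inside doJob(): skip anything outside the configured year
# if not in_window(publishDate):
#     continue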
[doJob]
;whether to start collecting
;verify the list page
flg = False
;verify the detail page
insertFlg = False
;when flg is False, only the list page is verified
;when flg is True and insertFlg is False, the detail page is verified
;when both are True, data is collected and stored
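flg and insertFlg act as a staged debugging switch for the spider. A minimal sketch of how they might be read with configparser and used to gate a run, assuming the file is saved as spider.ini (the file name and variable names are illustrative):

import configparser

config = configparser.ConfigParser()
config.read('spider.ini', encoding='utf-8')

flg = config.getboolean('doJob', 'flg')              # False: only verify the list page
insertFlg = config.getboolean('doJob', 'insertFlg')  # False: only verify the detail page

# inside the page/item loops:
#   flg False                  -> parse one list page, then stop
#   flg True, insertFlg False  -> fetch one detail page, then stop
#   both True                  -> collect and store every item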
;page numbers
[page]
;first and last page numbers to crawl
;determine them from the site's pagination links
begin = 1
begin = 0
end = 3
;links
[home]
;website name
origin = 贸易投资网-贸易商机
;whether the first-page URL differs from the later ones: True if it does, False otherwise
urlFlg = False
urlFlg = True
;fill in this value only when the first-page URL differs from the later ones
;example http://www.cgcjm.cecep.cn/g9222.aspx
urlBegin = http://www.aieco.org/article/ywfc
urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
;fill in this value only when the first-page URL differs from the later ones
urlBeginNum = 1
urlBeginNum = 0
;generic URL pattern for the crawled pages, with {} in place of the page number
;example http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
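urlFlg, urlBegin, urlBeginNum and url together describe how page URLs are assembled: the page whose number equals urlBeginNum uses the special first-page link, every other page is formatted from the {} template. A minimal sketch reusing the config object from the sketch above; note that urlBeginNum is read back as a string and must be cast before comparing, which is exactly the fix made in Spider.doJob further down:

def build_url(config, page):
    # the first page often has its own link, e.g. index.html vs index_{n}.html
    if config.getboolean('home', 'urlFlg') and page == int(config.get('home', 'urlBeginNum')):
        return config.get('home', 'urlBegin')
    return config.get('home', 'url').format(page)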
;list page
[homeSelect]
;CSS selector for the article list items
data_info_list = body > div.container > div > div > div
data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
;selector for the title
title = h5
title = a
;set this to False when the publish date is not shown on the list page
publishDateFlg = True
publishDate = span.format-datetime
publishDate = span
;format of the extracted date
publishDateType = %%Y-%%m-%%d
;selector for the link
href = a
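The [homeSelect] values are plain CSS selectors applied to each list item. A minimal sketch of how they are used with BeautifulSoup, mirroring the Spider.doJob code later in this commit (getSoup is the project's own helper; the other names are illustrative):

# soup = getSoup(build_url(config, page), headers)   # parsed list page
for data_info in soup.select(config.get('homeSelect', 'data_info_list')):
    title = data_info.select(config.get('homeSelect', 'title'))[0].text.strip()
    href = data_info.select(config.get('homeSelect', 'href'))[0].get('href')
    if config.getboolean('homeSelect', 'publishDateFlg'):
        publishDate = data_info.select(config.get('homeSelect', 'publishDate'))[0].text.strip()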
;detail page
[detailSelect]
;article body
contentWithTag = #articleBody
contentWithTag = #xwxq2 > div
;whether a source is listed
originFlg = True
;source
origin = #leftList > div.content_article_source > table > tr > td.first
origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
;publish date; only needed when publishDateFlg in the list-page section is False
publishDate = div.second-news-item-date
;format of the extracted date
......@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
[headers]
;request header fields
Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
Connectio = keep-alive
Sec-Fetch-Dest = document
Sec-Fetch-Mode = navigate
Sec-Fetch-Site = same-origin
Sec-Fetch-User = ?1
Upgrade-Insecure-Requests = 1
User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
sec-ch-ua-mobile = ?0
sec-ch-ua-platform = "Windows"
\ No newline at end of file
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
Host: www.camce.com.cn
If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
If-None-Match: "7b53-61d3e78f899c0-gzip"
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
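Two configparser details matter when the [headers] section is turned into a dict for requests: option names are lower-cased unless optionxform is overridden (harmless for HTTP, which ignores header-name case, but it changes what is sent on the wire), and under the default interpolation the %% escapes in publishDateType read back as single % characters. A minimal sketch, again assuming the config is saved as spider.ini:

import configparser

config = configparser.ConfigParser()   # default interpolation: '%%Y-%%m-%%d' reads back as '%Y-%m-%d'
config.optionxform = str               # keep 'User-Agent' instead of 'user-agent'
config.read('spider.ini', encoding='utf-8')

headers = dict(config.items('headers'))
# resp = requests.get(url, headers=headers, timeout=30)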
......@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
def doJob():
# for page in range(1, 13):
for page in range(0,5):
if page == 0:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
else:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
# url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
for page in range(1, 13):
# for page in range(0,5):
# if page == 0:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
# else:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
soup = getSoup(url)
liList = soup.find('ul', class_='right-column-list').find_all('li')
for liTag in liList:
......
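The zwjgdt column uses index.html for its first page and index_{n}.html afterwards, which is why the old branch special-cased page 0; the new gsxw loop starts at page 1 and always appends ?ordernum=1. A minimal sketch of the pattern with a hypothetical helper (whether gsxw keeps the same first-page convention is not shown in the diff):

def crbc_page_url(column, page, suffix=''):
    # first page is index.html, later pages are index_{n}.html
    if page == 0:
        return f'https://www.crbc.com/site/crbc/{column}/index.html{suffix}'
    return f'https://www.crbc.com/site/crbc/{column}/index_{page}.html{suffix}'

# crbc_page_url('zwjgdt', 0)               -> .../zwjgdt/index.html
# crbc_page_url('gsxw', 3, '?ordernum=1')  -> .../gsxw/index_3.html?ordernum=1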
......@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
def doJob():
for page in range(1, 4):
url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
# url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
url = ''
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
......
......@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
from base import BaseCore
from elasticsearch import Elasticsearch, helpers
# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
baseCore = BaseCore.BaseCore()
# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
baseCore = BaseCore.BaseCore()
from powerchina import db_stroage
......@@ -63,8 +64,8 @@ def select(sid):
{
"range": {
"publishDate": {
"gte": "2023-01-01T00:00:00",
"lte": "2023-12-31T23:59:59"
"gte": "2022-01-01T00:00:00",
"lte": "2022-12-31T23:59:59"
}
}
}
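The range clause above is the part of select(sid) that pins results to the 2022 publish-date window. A hedged sketch of how the full query body might look; the index name and the term filter on sid are assumptions, not taken from this diff:

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"sid.keyword": sid}},   # assumed: restrict to one info source
                {"range": {"publishDate": {
                    "gte": "2022-01-01T00:00:00",
                    "lte": "2022-12-31T23:59:59"
                }}}
            ]
        }
    }
}
# results = es_client.search(index="subject_dataset", body=query, size=1000)  # index name is hypothetical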
......@@ -191,29 +192,109 @@ def select(sid):
# df = pd.DataFrame(data_list)
# df.to_excel('./项目资讯-定制.xlsx', index=False)
sids = ['1811965474060091394',
'1811960555242528770',
'1811958016631644161',
'1811955892663336962',
'1811950817584857089',
'1811939863161716737',
'1811937580770402305',
'1811933720142135297',
'1811566665440186370',
'1810983037486170113',
'1810980529153966081',
'1810978470438567938',
'1810976012817707009',
'1810972790830858242',
'1810968708888068097',
'1810960658496102401',
'1810954505034969089',
'1810947397855879170']
for sid in sids:
num = 0
datas = select(sid)
# sids = ['1811965474060091394',
# '1811960555242528770',
# '1811958016631644161',
# '1811955892663336962',
# '1811950817584857089',
# '1811939863161716737',
# '1811937580770402305',
# '1811933720142135297',
# '1811566665440186370',
# '1810983037486170113',
# '1810980529153966081',
# '1810978470438567938',
# '1810976012817707009',
# '1810972790830858242',
# '1810968708888068097',
# '1810960658496102401',
# '1810954505034969089',
# '1810947397855879170']
# for sid in sids:
# num = 0
# datas = select(sid)
# for data in datas:
# num += 1
# print(f'{sid}==={num}')
# fetch data from ES
def esData():
sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
baseCore.cursor_.execute(sql)
datas = baseCore.cursor_.fetchall()
dics = []
urlList = []
for data in datas:
num += 1
print(f'{sid}==={num}')
sid = data[0]
sqlSelect = f'select web_site_name from info_source where id="{sid}"'
baseCore.cursor_.execute(sqlSelect)
web = baseCore.cursor_.fetchone()[0]
results = select(sid)
num = 0
for result in results:
try:
title = result['_source']['title']
publishDate = result['_source']['publishDate']
if len(publishDate) == 10:
publishDate = publishDate + ' 00:00:00'
else:
publishDate = publishDate.replace('T', ' ')
origin = result['_source']['origin']
content = result['_source']['content']
contentWithTag = result['_source']['contentWithTag']
url = result['_source']['sourceAddress']
except:
continue
if url in urlList:
continue
dic = {
'标题': title,
'发布时间': publishDate,
'来源': origin,
'网站': web,
'正文': content,
'正文带标签': str(contentWithTag),
'原文链接': url
}
dics.append(dic)
urlList.append(url)
df = pd.DataFrame(dics)
df.to_excel('./项目资讯-定制.xlsx', index=False)
# export data from the Shencai collection
def shencai():
datas_ = []
datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
for data in datas:
title = data['标题']
href = data['URL']
origin = data['来源']
web = data['栏目']
content = data['正文不带标签']
contentWithTag = data['内容']
publishDate = data['日期']
# print(publishDate)
datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
df.to_excel('./项目资讯-定制.xlsx', index=False)
# copy the customized MongoDB collection and export it
def dingzhi():
data_list = []
datas = db_stroageA.find()
for data in datas:
del data['_id']
db_stroageB.insert_one(data)
data_list.append(data)
df = pd.DataFrame(data_list)
df.to_excel('./项目资讯-定制.xlsx', index=False)
if __name__ == '__main__':
# esData()
# shencai()
# dingzhi()
pass
baseCore.close()
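Note that esData(), shencai() and dingzhi() all write to the same ./项目资讯-定制.xlsx file, so only one of them should be un-commented per run. A usage sketch of the __main__ block:

if __name__ == '__main__':
    esData()          # or shencai() / dingzhi(), one at a time: they share the output path
    baseCore.close()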
......@@ -46,10 +46,10 @@ class Spider():
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
return
if self.config.getboolean('detailSelect', 'originFlg'):
# origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
source = soup.find('body').find('script').text
source = re.findall('source = \"(.*?)\";', source)[0]
origin = source
origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
# source = soup.find('body').find('script').text
# source = re.findall('source = \"(.*?)\";', source)[0]
# origin = source
try:
try:
origin = origin.split('来源:')[1].strip()
......@@ -88,8 +88,8 @@ class Spider():
def doJob(self):
for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
url = self.config.get('sit', 'urlBegin')
if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
url = self.config.get('home', 'urlBegin')
else:
url = self.config.get('home', 'url').format(page)
soup = getSoup(url, self.getHeader())
......@@ -104,14 +104,14 @@ class Spider():
title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
except:
continue
href = data_info.get('onclick')
href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
# href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('onclick')
# href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('href')
if self.config.getboolean('homeSelect', 'publishDateFlg'):
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
# publishDate = href.split('net.cn')[1].split('art_')[0]
# publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
continue
else:
......@@ -121,13 +121,14 @@ class Spider():
log.info(f'开始采集==={title}==={publishDate}==={href}')
if not self.config.getboolean('doJob', 'flg'):
break
try:
pass
# self.getDic(href, title, publishDate)
except Exception as e:
log.error(f'{title}===采集失败==={e}')
# try:
self.getDic(href, title, publishDate)
# except Exception as e:
# log.error(f'{title}===采集失败==={e}')
if not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
if not self.config.getboolean('doJob', 'flg'):
if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
......
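The new list-page branch keeps only the date part of the raw string (everything before 'T'), reparses it with the configured publishDateType, and reformats it into a full timestamp before the range check. A minimal sketch of that normalization with a hypothetical helper, assuming the '%Y-%m-%d' format that publishDateType resolves to:

import datetime

def normalize_publish_date(raw, fmt='%Y-%m-%d'):
    # '2024-08-02T10:15:00+08:00' -> '2024-08-02 00:00:00'
    raw = raw.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
    return datetime.datetime.strptime(raw, fmt).strftime('%Y-%m-%d %H:%M:%S')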