Commit e8838732  Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
......@@ -6,21 +6,26 @@ import re
import time
import urllib.parse
from urllib.parse import urljoin
import sys
import pymongo
import requests
import subprocess
from functools import partial
import traceback
from bs4 import BeautifulSoup
from retry import retry
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# sys.path.append('D:\\zzsn_spider\\base')
from base import BaseCore
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs
from base import BaseCore
baseCore = BaseCore.BaseCore(sqlflg=False)
baseCore = BaseCore.BaseCore(sqlFlg=False)
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'tradingEconomics']
......@@ -60,7 +65,7 @@ def paserUrl(html, listurl):
@retry(tries=2, delay=5)
def getSoup(url):
req = requests.get(url, headers=headers)
req = requests.get(url, headers=headers,timeout=20)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
soup = paserUrl(soup, url)
......@@ -74,8 +79,6 @@ def getCountries():
soup = getSoup(url)
div_list = soup.select('#ctl00_ContentPlaceHolder1_ctl01_tableCountries > div')
for div_tag in div_list:
if 'G20' in div_tag.text:
continue
li_list = div_tag.select('> ul > li')
for li_tag in li_list:
if 'active' in li_tag['class']:
......@@ -91,18 +94,14 @@ def getIndex(country, url):
soup = getSoup(url)
li_list = soup.find('div', class_='pagetabs').find('ul', attrs={'id': 'pagemenutabs'}).find_all('li', class_='nav-item')
div_list = soup.find('div', class_='tab-content').find_all('div', class_='tab-pane')
# for i in range(len(li_list)):
# li_tag = li_list[i]
# if 'Overview' in li_tag.find('a').text.strip():
# del div_list[i]
# break
for i in range(len(li_list)):
li_tag = li_list[i]
if 'Overview' in li_tag.find('a').text.strip():
del div_list[i]
break
for i in range(len(div_list)):
div_tag = div_list[i]
try:
tr_list = div_tag.find('table').find('tbody').find_all('tr')
except:
print(url, i, sep='===')
continue
for tr_tag in tr_list:
option = tr_tag.find('td').find('a').text.strip()
href = tr_tag.find('td').find('a').get('href')
......@@ -124,24 +123,32 @@ def getTimeKey(timeType, calendarStr):
timeKey = calendarStr[:4] + 'Q4'
elif timeType == 'yearly':
timeKey = calendarStr[:4]
elif timeType == 'weekly':
date_obj = datetime.datetime.strptime(calendarStr,'%Y-%m-%d')
year, week, day = date_obj.isocalendar()
if len(str(week)) == 1:
week = f'0{week}'
timeKey = f'{year}{week}'
else:
timeKey = False
timeKey = calendarStr.replace('-','')
return timeKey
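The new 'weekly' branch derives an ISO year-week key and zero-pads single-digit week numbers. A minimal sketch of the same derivation, assuming the calendar string uses the '%Y-%m-%d' format parsed above:

import datetime

def iso_week_key(calendar_str):
    # isocalendar() returns (ISO year, ISO week, ISO weekday)
    year, week, _ = datetime.datetime.strptime(calendar_str, '%Y-%m-%d').isocalendar()
    # zero-pad the week so keys stay fixed-width, e.g. week 1 of 2024 -> '202401'
    return f'{year}{week:02d}'

print(iso_week_key('2024-01-03'))  # -> 202401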
@retry(tries=3, delay=5)
def decrypt(value,js_runtime):
dataJson = js_runtime.call('doJob', value, key)
return dataJson
@retry(tries=3, delay=10)
def getJson(url):
req = requests.get(url, headers=headers, timeout=20, verify=False)
def getJson(url,js_runtime):
req = requests.get(url, headers=headers, timeout=20,verify=False)
value = req.text.replace('"', '')
req.close()
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
js_runtime = execjs.compile(js)
try:
dataJson = js_runtime.call('doJob', value, key)
except:
print(value)
dataJson = decrypt(value,js_runtime)
except Exception as e:
log.error(e)
raise
if dataJson:
try:
dataJson = json.loads(dataJson)[0]
except:
......@@ -149,7 +156,8 @@ def getJson(url):
return dataJson
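With this refactor the execjs runtime is no longer rebuilt on every request: the caller compiles trandingEconomics.js once and passes the runtime into getJson, while decrypt() retries only the JS call. A minimal usage sketch, assuming the script defines doJob(value, key) and a Node runtime is available to execjs; the URL below is illustrative, not taken from a real run:

import execjs

with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
    js_runtime = execjs.compile(f.read())        # compile once, reuse for every request

href = 'https://d3ii0wo49og5mi.cloudfront.net/economics/gdp?&span=1y&v=20240102145900&key=<TEChartsToken>'  # illustrative
dataJson = getJson(href, js_runtime)             # retried fetch + decryption, parsed to a dict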
def getData(url):
def getData(url,js_runtime):
type_list = ['1w','1m','6m','1y','5y','10y','25y','50y','max','all']
createTime = datetime.datetime.now()
createTimeStr = createTime.strftime('%Y-%m-%d')
soup = getSoup(url)
......@@ -161,7 +169,7 @@ def getData(url):
break
else:
log.error(f'数据链接获取失败==={url}')
return
return False
for script in scripts:
if 'TEChartsToken' in script.text:
TEChartsToken = re.findall('TEChartsToken = \'(.*?)\'', script.text)[0]
......@@ -169,10 +177,18 @@ def getData(url):
break
else:
log.error(f'数据链接获取失败==={url}')
return
return False
TESymbol = TESymbol.lower()
href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span=max&v=20240102145900&key={TEChartsToken}'
dataJson = getJson(href)
for type in type_list:
href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span={type}&v=20240102145900&key={TEChartsToken}'
try:
dataJson = getJson(href,js_runtime)
except Exception as e:
log.error(f'{type}===数据请求失败==={e}')
return False
#series = dataJson['series'][-10:]
if not dataJson:
continue
series = dataJson['series']
for serie_ in series:
serie = serie_['serie']
......@@ -202,9 +218,10 @@ def getData(url):
}
if db_storage.find_one({'country': country, 'indicators': indicators, 'timeType': timeType, 'calendarStr': calendarStr}):
log.info(f'{country}==={indicators}==={calendarStr}===已采集')
break
continue
db_storage.insert_one(dic)
log.info(f'{country}==={indicators}==={calendarStr}===入库成功')
return True
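Two behavioural changes stand out in getData: every span in type_list is fetched and empty responses are skipped, and a record that already exists now triggers continue instead of break, so later points in the same serie are still examined. The duplicate check treats four fields as the logical key; a hypothetical helper expressing that lookup:

def already_collected(db_storage, country, indicators, timeType, calendarStr):
    # country + indicator + time granularity + calendar key identify one data point
    return db_storage.find_one({'country': country, 'indicators': indicators,
                                'timeType': timeType, 'calendarStr': calendarStr}) is not None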
def doJob():
......@@ -215,10 +232,22 @@ def doJob():
info = f'{index[0]}|{index[1]}|{index[2]}'
baseCore.r.rpush('trandingEconomics:info', info)
log.info('数据已全部放入redis中')
# log.info(f'开始采集==={index[0]}==={index[1]}')
# getData(index[2])
def doJobA():
try:
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
execjs.get('Node')
js_runtime = execjs.compile(js)
except:
return
errorNum = 0
while True:
if errorNum > 10:
break
info = baseCore.r.blpop(['trandingEconomics:info'], 2)
if not info:
log.info('数据已全部采集完成')
......@@ -228,25 +257,21 @@ def doJobA():
country = info.split('|')[0]
index = info.split('|')[1]
url = info.split('|')[2]
if url.endswith('/rating'):
continue
log.info(f'开始采集==={country}==={index}')
try:
getData(url)
if getData(url,js_runtime):
pass
else:
errorNum += 1
baseCore.r.rpush('trandingEconomics:info', info)
except Exception as e:
errorNum += 1
traceback.print_exc()
log.error(f'{country}==={index}===采集失败==={e}')
baseCore.r.rpush('trandingEconomics:info', info)
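doJobA now compiles the JS runtime up front (returning early if Node or the script is unavailable), counts failures, stops once more than 10 have accumulated, and pushes each failed task back onto the Redis list for a later attempt. The queue pattern in isolation, a sketch assuming a plain redis client in place of baseCore.r and a placeholder handle():

import redis

r = redis.Redis()                                   # stands in for the client baseCore.r wraps

def handle(task):
    # placeholder: doJobA splits 'country|index|url' here and calls getData(url, js_runtime)
    return True

errors = 0
while errors <= 10:
    item = r.blpop(['trandingEconomics:info'], 2)   # (key, value) tuple, or None after 2 seconds
    if not item:
        break                                       # queue drained
    task = item[1].decode()
    try:
        ok = handle(task)
    except Exception:
        ok = False
    if not ok:
        errors += 1                                 # stop the worker after more than 10 failures
        r.rpush('trandingEconomics:info', task)     # requeue the task for another attempt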
if __name__ == "__main__":
# testA()
# doJob()
# Peru===Currency
# Bulgaria===Stock Market
# Bulgaria===Interest Rate
# infoLens = baseCore.r.llen('trandingEconomics:info')
# print(infoLens)
jiami = 'a/lpZGluZ2VjbOvgLCKnQz3mlyvrByT3TUWWTZ/Bt9RVQx5xnQYsCU4fVSSo3ZGypEPLdDwREI65v+hkHO32iRzgmdYJc3AZFO6drPcW7yzvT7ovG7g4qxA1n3kxhiEQ808R90cOX+DZdz2H+xeTxuDmi/Un7sLeUZCPe3TS0sayhyPwOhUjXx/fFk2agaDz4pU0xWL34265lqd4zZSkAwwcpX/eLI5BvDEHKP61naRAHNgUIaX1g9DoyYzV9Mi6bu7gvSDvpkcvwyQ6WiOaoSpjI4vK2Kdt2SgJu92zedyrjpmpSjFfjnEf2Y6tIjgTY480acLmcWkt'
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
js_runtime = execjs.compile(js)
dataJson = js_runtime.call('doJob', jiami, key)
print(dataJson)
doJobA()
......@@ -57,7 +57,7 @@ def doJob():
origin = dataJson['source']
if db_stroage.find_one({'原文链接': href}):
continue
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
continue
log.info(f'开始采集==={title}')
getDic(href, title, publishDate, origin, type[0])
......
[doJob]
;是否开始采集
;验证列表页
flg = False
;验证详情页
insertFlg = False
;flg为False时,验证列表页
;flg为True,insertFlg为False时,验证详情页
;都为True时,采集数据
;页码
[page]
;网页爬取页数的起始与末尾
;需要从链接中判断
begin = 1
begin = 0
end = 3
;链接
[home]
;网站名称
origin = 贸易投资网-贸易商机
;首页链接是否与后续链接有不同:是为True;不是为False
urlFlg = False
urlFlg = True
;如果首页链接与后续链接不同,需要填写该值
;样例 http://www.cgcjm.cecep.cn/g9222.aspx
urlBegin = http://www.aieco.org/article/ywfc
urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
;如果首页链接与后续链接不同,需要填写该值
urlBeginNum = 1
urlBeginNum = 0
;爬取网站链接的通用格式,页码处使用{}代替
;样例 http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
;列表页
[homeSelect]
;资讯列表的select
data_info_list = body > div.container > div > div > div
data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
;标题所在 select
title = h5
title = a
;发布时间不在列表页显示 该值需要置为False
publishDateFlg = True
publishDate = span.format-datetime
publishDate = span
;获取到时间的格式
publishDateType = %%Y-%%m-%%d
;链接所在 select
href = a
;详情页
[detailSelect]
;正文
contentWithTag = #articleBody
contentWithTag = #xwxq2 > div
;是否有来源
originFlg = True
;来源
origin = #leftList > div.content_article_source > table > tr > td.first
origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
;发布时间 如果home中的publishDateFlg为False才需要配置
publishDate = div.second-news-item-date
;获取到时间的格式
......@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
[headers]
;请求头信息
Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
Connectio = keep-alive
Sec-Fetch-Dest = document
Sec-Fetch-Mode = navigate
Sec-Fetch-Site = same-origin
Sec-Fetch-User = ?1
Upgrade-Insecure-Requests = 1
User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
sec-ch-ua-mobile = ?0
sec-ch-ua-platform = "Windows"
\ No newline at end of file
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
Host: www.camce.com.cn
If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
If-None-Match: "7b53-61d3e78f899c0-gzip"
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
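These INI files drive a generic, config-driven spider: [doJob] gates how far a run goes (list-page check, detail-page check, or full collection), [page] bounds the pagination loop, [home]/[homeSelect]/[detailSelect] describe the target site and its CSS selectors, and [headers] is sent with every request. A sketch of how the pagination keys are consumed, assuming configparser and an illustrative file name; the real Spider class lives elsewhere in this repo:

import configparser

config = configparser.ConfigParser()
config.read('camce.ini', encoding='utf-8')           # file name is illustrative

begin = int(config.get('page', 'begin'))
end = int(config.get('page', 'end'))
for page in range(begin, end):
    # the first page of some sites uses a different URL than the numbered pages
    if config.getboolean('home', 'urlFlg') and page == int(config.get('home', 'urlBeginNum')):
        url = config.get('home', 'urlBegin')
    else:
        url = config.get('home', 'url').format(page)
    print(url)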
......@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
def doJob():
# for page in range(1, 13):
for page in range(0,5):
if page == 0:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
else:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
# url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
for page in range(1, 13):
# for page in range(0,5):
# if page == 0:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
# else:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
soup = getSoup(url)
liList = soup.find('ul', class_='right-column-list').find_all('li')
for liTag in liList:
......
......@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
def doJob():
for page in range(1, 4):
url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
# url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
url = ''
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
......
......@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
from base import BaseCore
from elasticsearch import Elasticsearch, helpers
# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
baseCore = BaseCore.BaseCore()
# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
baseCore = BaseCore.BaseCore()
from powerchina import db_stroage
......@@ -63,8 +64,8 @@ def select(sid):
{
"range": {
"publishDate": {
"gte": "2023-01-01T00:00:00",
"lte": "2023-12-31T23:59:59"
"gte": "2022-01-01T00:00:00",
"lte": "2022-12-31T23:59:59"
}
}
}
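The range filter above narrows the export window from 2023 to 2022; publishDate is stored as an ISO timestamp, so the gte/lte bounds select exactly one calendar year. A hedged sketch of a comparable query run through the helpers module imported above (the bool/must wrapper, the sid field name and the index name are assumptions, not taken from this file):

sid = '1810947397855879170'                          # one of the subject ids listed further down
query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"sid.keyword": sid}},      # field name assumed
                {"range": {"publishDate": {"gte": "2022-01-01T00:00:00",
                                           "lte": "2022-12-31T23:59:59"}}}
            ]
        }
    }
}
for hit in helpers.scan(es_client, query=query, index="subject_dataset"):   # index name assumed
    print(hit['_source']['title'])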
......@@ -191,29 +192,109 @@ def select(sid):
# df = pd.DataFrame(data_list)
# df.to_excel('./项目资讯-定制.xlsx', index=False)
sids = ['1811965474060091394',
'1811960555242528770',
'1811958016631644161',
'1811955892663336962',
'1811950817584857089',
'1811939863161716737',
'1811937580770402305',
'1811933720142135297',
'1811566665440186370',
'1810983037486170113',
'1810980529153966081',
'1810978470438567938',
'1810976012817707009',
'1810972790830858242',
'1810968708888068097',
'1810960658496102401',
'1810954505034969089',
'1810947397855879170']
for sid in sids:
# sids = ['1811965474060091394',
# '1811960555242528770',
# '1811958016631644161',
# '1811955892663336962',
# '1811950817584857089',
# '1811939863161716737',
# '1811937580770402305',
# '1811933720142135297',
# '1811566665440186370',
# '1810983037486170113',
# '1810980529153966081',
# '1810978470438567938',
# '1810976012817707009',
# '1810972790830858242',
# '1810968708888068097',
# '1810960658496102401',
# '1810954505034969089',
# '1810947397855879170']
# for sid in sids:
# num = 0
# datas = select(sid)
# for data in datas:
# num += 1
# print(f'{sid}==={num}')
# es获取数据
def esData():
sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
baseCore.cursor_.execute(sql)
datas = baseCore.cursor_.fetchall()
dics = []
urlList = []
for data in datas:
sid = data[0]
sqlSelect = f'select web_site_name from info_source where id="{sid}"'
baseCore.cursor_.execute(sqlSelect)
web = baseCore.cursor_.fetchone()[0]
results = select(sid)
num = 0
datas = select(sid)
for result in results:
try:
title = result['_source']['title']
publishDate = result['_source']['publishDate']
if len(publishDate) == 10:
publishDate = publishDate + ' 00:00:00'
else:
publishDate = publishDate.replace('T', ' ')
origin = result['_source']['origin']
content = result['_source']['content']
contentWithTag = result['_source']['contentWithTag']
url = result['_source']['sourceAddress']
except:
continue
if url in urlList:
continue
dic = {
'标题': title,
'发布时间': publishDate,
'来源': origin,
'网站': web,
'正文': content,
'正文带标签': str(contentWithTag),
'原文链接': url
}
dics.append(dic)
urlList.append(url)
df = pd.DataFrame(dics)
df.to_excel('./项目资讯-定制.xlsx', index=False)
# 神采数据导出
def shencai():
datas_ = []
datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
for data in datas:
num += 1
print(f'{sid}==={num}')
title = data['标题']
href = data['URL']
origin = data['来源']
web = data['栏目']
content = data['正文不带标签']
contentWithTag = data['内容']
publishDate = data['日期']
# print(publishDate)
datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
df.to_excel('./项目资讯-定制.xlsx', index=False)
#mongodb 定制
def dingzhi():
data_list = []
datas = db_stroageA.find()
for data in datas:
del data['_id']
db_stroageB.insert_one(data)
data_list.append(data)
df = pd.DataFrame(data_list)
df.to_excel('./项目资讯-定制.xlsx', index=False)
if __name__ == '__main__':
# esData()
# shencai()
# dingzhi()
pass
baseCore.close()
......@@ -46,10 +46,10 @@ class Spider():
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
return
if self.config.getboolean('detailSelect', 'originFlg'):
# origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
source = soup.find('body').find('script').text
source = re.findall('source = \"(.*?)\";', source)[0]
origin = source
origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
# source = soup.find('body').find('script').text
# source = re.findall('source = \"(.*?)\";', source)[0]
# origin = source
try:
try:
origin = origin.split('来源:')[1].strip()
......@@ -88,8 +88,8 @@ class Spider():
def doJob(self):
for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
url = self.config.get('sit', 'urlBegin')
if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
url = self.config.get('home', 'urlBegin')
else:
url = self.config.get('home', 'url').format(page)
soup = getSoup(url, self.getHeader())
......@@ -104,14 +104,14 @@ class Spider():
title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
except:
continue
href = data_info.get('onclick')
href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
# href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('onclick')
# href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('href')
if self.config.getboolean('homeSelect', 'publishDateFlg'):
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
# publishDate = href.split('net.cn')[1].split('art_')[0]
# publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
continue
else:
......@@ -121,13 +121,14 @@ class Spider():
log.info(f'开始采集==={title}==={publishDate}==={href}')
if not self.config.getboolean('doJob', 'flg'):
break
try:
pass
# self.getDic(href, title, publishDate)
except Exception as e:
log.error(f'{title}===采集失败==={e}')
# try:
self.getDic(href, title, publishDate)
# except Exception as e:
# log.error(f'{title}===采集失败==={e}')
if not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
if not self.config.getboolean('doJob', 'flg'):
if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
......