Commit 060ce7c4 by 薛凌堃

2023/8/12

Parent aedff657
@@ -261,76 +261,78 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
     if soup == '':
         return False
     # Get the number of pages first
-    page = 0
     try:
         page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
     except:
         e = f"该企业没有{dic_parms['Catagory2']}数据"
         state = 0
         takeTime = baseCore.getTimeCost(start_time, time.time())
-        baseCore.recordLog(social_code, taskType, state, takeTime, dic_parms['url'], 'Kafka操作失败')
+        baseCore.recordLog(social_code, taskType, state, takeTime, dic_parms['url'], f'{e}')
         return False
-    if page != 0:
-        total = re.findall(r'\d+', page)[0]
-        r_page = int(total) % 15
-        if r_page == 0:
-            Maxpage = int(total) // 15
-        else:
-            Maxpage = int(total) // 15 + 1
-        log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
-        # The first page's URL differs from later pages; adjust the link when iterating
-        for i in range(1, Maxpage + 1):
-            log.info(f'==========正在采集第{i}页=========')
-            if i == 1:
-                href = url
-            else:
-                # http://eid.csrc.gov.cn/101811/index_3_f.html
-                href = url.split('index')[0] + f'index_{i}_f.html'
-            soup = RequestUrl(href, payload, social_code, start_time)
-            if soup == '':
-                continue
-            tr_list = soup.find('div', id='txt').find_all('tr')
-            pageIndex = 0
-            for tr in tr_list[1:]:
-                pageIndex += 1
-                td_list = tr.find_all('td')
-                pdf_url_info = td_list[2]
-                # print(pdf_url)
-                pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
-                name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
-                pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
-                year = pub_time[:4]
-                report_type = td_list[4].text.strip()
-                # Insert the record into the database
-                insert = InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type)
-                log.info(f'======={short_name}========{code}===插入公告库成功')
-                if insert:
-                    # # Announcement list
-                    # okCount = okCount + 1
-                    # Parse the PDF: fetch the link, download, then parse (success/failure) and transfer (success/failure)
-                    result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time)
-                    if result:
-                        # Announcement list
-                        okCount = okCount + 1
-                        log.info(f'{short_name}==============解析传输操作成功')
-                        state = 1
-                        takeTime = baseCore.getTimeCost(start_time, time.time())
-                        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
-                        pass
-                    else:
-                        errorCount += 1
-                        # time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                        log.error(f'{short_name}=============解析或传输操作失败')
-                        # try:
-                        # insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
-                        # cursor_.execute(insert_err_sql)
-                        # cnx_.commit()
-                        # except:
-                        # pass
-                        continue
+    total = re.findall(r'\d+', page)[0]
+    r_page = int(total) % 15
+    if r_page == 0:
+        Maxpage = int(total) // 15
+    else:
+        Maxpage = int(total) // 15 + 1
+    log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
+    # The first page's URL differs from later pages; adjust the link when iterating
+    for i in range(1, Maxpage + 1):
+        log.info(f'==========正在采集第{i}页=========')
+        if i == 1:
+            href = url
+        else:
+            # http://eid.csrc.gov.cn/101811/index_3_f.html
+            href = url.split('index')[0] + f'index_{i}_f.html'
+        soup = RequestUrl(href, payload, social_code, start_time)
+        if soup == '':
+            continue
+        tr_list = soup.find('div', id='txt').find_all('tr')
+        pageIndex = 0
+        for tr in tr_list[1:]:
+            pageIndex += 1
+            td_list = tr.find_all('td')
+            pdf_url_info = td_list[2]
+            # print(pdf_url)
+            pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
+            name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
+            pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
+            year = pub_time[:4]
+            report_type = td_list[4].text.strip()
+            # Insert the record into the database
+            insert = InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type)
+            log.info(f'======={short_name}========{code}===插入公告库成功')
+            if insert:
+                # # Announcement list
+                # okCount = okCount + 1
+                # Parse the PDF: fetch the link, download, then parse (success/failure) and transfer (success/failure)
+                result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time)
+                if result:
+                    # Announcement list
+                    okCount = okCount + 1
+                    log.info(f'{short_name}==============解析传输操作成功')
+                    state = 1
+                    takeTime = baseCore.getTimeCost(start_time, time.time())
+                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
+                    pass
+                else:
+                    errorCount += 1
+                    # time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    log.error(f'{short_name}=============解析或传输操作失败')
+                    # try:
+                    # insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
+                    # cursor_.execute(insert_err_sql)
+                    # cnx_.commit()
+                    # except:
+                    # pass
+                    continue
     return True
     #state2
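For reference, the onclick parsing in this hunk relies on chained strip('downloadPdf1(') / split(',') calls, which are fragile: str.strip removes a *set of characters* from both ends rather than a literal prefix, and split(',') breaks whenever the title itself contains an ASCII comma. A minimal regex-based sketch of the same extraction, assuming the onclick value follows the downloadPdf1('url','title','date') shape implied by the code (the sample value and helper name below are illustrative, not from the repository):

import re

# Assumed shape of the onclick attribute, inferred from the hunk:
#   downloadPdf1('/finalpage/2023-08-12/xxx.pdf','2023年半年度报告','2023-08-12')
ONCLICK_RE = re.compile(r"downloadPdf1\('([^']*)',\s*'([^']*)',\s*'([^']*)'")

def parse_onclick(onclick):
    """Return (pdf_url, name_pdf, pub_time), or None if the pattern is absent."""
    m = ONCLICK_RE.search(onclick)
    return m.groups() if m else None

sample = "downloadPdf1('/finalpage/2023-08-12/xxx.pdf','2023年半年度报告','2023-08-12')"
pdf_url, name_pdf, pub_time = parse_onclick(sample)
year = pub_time[:4]  # same four-character slice the hunk uses for the year

The original chained calls happen to work as long as titles avoid ASCII commas and stray quote characters; the regex just makes that assumption explicit and fails cleanly (returns None) instead of mis-splitting.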
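The paging arithmetic in the hunk (total % 15 to decide whether a partial page exists, total // 15 for full pages, plus the index_{i}_f.html pattern noted in the inline comment) is ceil-division over 15-row pages. A self-contained sketch, assuming page 1 reuses the landing URL exactly as the code does; the example URLs are patterned on the one URL that appears in the hunk:

import math

def page_urls(url, total, page_size=15):
    """Yield one listing URL per result page.

    Equivalent to the hunk's total % 15 / total // 15 (+ 1) arithmetic:
    page 1 keeps the landing URL, later pages substitute index_{i}_f.html.
    """
    for i in range(1, math.ceil(total / page_size) + 1):
        if i == 1:
            yield url
        else:
            yield url.split('index')[0] + f'index_{i}_f.html'

# 31 records -> 3 pages (landing URL here is an assumed example):
# list(page_urls('http://eid.csrc.gov.cn/101811/index_f.html', 31))
# -> ['http://eid.csrc.gov.cn/101811/index_f.html',
#     'http://eid.csrc.gov.cn/101811/index_2_f.html',
#     'http://eid.csrc.gov.cn/101811/index_3_f.html']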