王景浩 / zzsn_spider · Commits

Commit cb908caf
Authored Sep 01, 2023 by 薛凌堃
Commit message: 政策法规 (policies and regulations)
Parent: 593410c8

Showing 3 changed files with 22 additions and 115 deletions (+22 −115)
comData/annualReport_ZJH/证监会-年报.py  (+1 −73)
comData/policylaw/2.py  (+20 −41)
comData/weixin_solo/get_tokenCookies.py  (+1 −1)
comData/annualReport_ZJH/证监会-年报.py
 import json
...
@@ -124,78 +124,6 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             print(f'com_name:{short_name}、{year}已存在')
             continue
         else:
-            # # If the type is an annual report, parse that report's PDF and save it to the database
-            # for i in range(0, 3):
-            #     try:
-            #         resp_content = requests.request("GET", pdf_url).content
-            #         # Get the PDF page count
-            #         with fitz.open(stream=resp_content, filetype='pdf') as doc:
-            #             page_size = doc.page_count
-            #         break
-            #     except Exception as e:
-            #         print(e)
-            #         time.sleep(3)
-            #         continue
-            # if page_size < 1:
-            #     # PDF parsing failed
-            #     print(f'==={short_name}、{year}===pdf解析失败')
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
-            #     continue
-            # result = ''
-            # for i in range(0, 3):
-            #     try:
-            #         result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
-            #         break
-            #     except Exception as e:
-            #         print(e)
-            #         time.sleep(3)
-            #         continue
-            # if result == '':
-            #     e = '上传服务器失败'
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     continue
-            #
-            # if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
-            #
-            #     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            #
-            #     type_id = '1'
-            #     item_id = dic_info['social_code']
-            #     group_name = 'group1'
-            #
-            #     path = bytes.decode(result['Remote file_id']).replace('group1', '')
-            #     full_path = bytes.decode(result['Remote file_id'])
-            #     category = 'pdf'
-            #     file_size = result['Uploaded size']
-            #     order_by = num
-            #     status = 1
-            #     create_by = 'XueLingKun'
-            #     create_time = time_now
-            #     page_size = page_size
-            #     try:
-            #         tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
-            #                     category, file_size, order_by, status, create_by, create_time, page_size)
-            #         state = 1
-            #         takeTime = baseCore.getTimeCost(start_time, time.time())
-            #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
-            #     except:
-            #         e = '数据库传输失败'
-            #         state = 0
-            #         takeTime = baseCore.getTimeCost(start_time, time.time())
-            #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     num = num + 1
-            #     time.sleep(2)
-            # else:
-            #     e = '采集失败'
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     continue
+            # Upload to the file server
+            retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
             # Insert into the database to get the att_id
             num = num + 1
...
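Note: the net effect of this hunk is that the inline retry / FastDFS-upload / logging sequence is collapsed into a single baseCore.upLoadToServe call. A minimal sketch of what such a helper might look like, assuming a FastDFS-style client object exposing upload_by_buffer as in the deleted code; the real signature and return shape of baseCore.upLoadToServe are not shown in this diff:

    import time
    import requests

    def upLoadToServe(pdf_url, type_id, social_code, client, retries=3):
        """Hypothetical sketch: download a PDF with retries, push it to the
        file server, and return the metadata needed for the att_id insert."""
        content = None
        for _ in range(retries):  # same 3-attempt retry loop the deleted code used
            try:
                resp = requests.get(pdf_url, timeout=30)
                resp.raise_for_status()
                content = resp.content
                break
            except requests.RequestException as e:
                print(e)
                time.sleep(3)
        if content is None:
            return None  # caller records the failure, as recordLog did before
        # upload_by_buffer is the FastDFS client call visible in the deleted block
        result = client.upload_by_buffer(content, file_ext_name='pdf')
        return {
            'type_id': type_id,
            'item_id': social_code,
            'full_path': bytes.decode(result['Remote file_id']),
            'file_size': result['Uploaded size'],
        }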
comData/policylaw/2.py
...
@@ -47,41 +47,6 @@ def replaceUrl(hostUrl,src):
         finnal_href = hostUrl + src
     return finnal_href

-def attachjob(fu_jian_soup, href):
-    for fu_jian_tag in fu_jian_soup:
-        try:
-            # Attachment link
-            fu_jian_href = fu_jian_tag['href']
-            pass
-        except:
-            continue
-        # todo: convert the link to an absolute path
-        # todo: upload the attachment to the file server and get back the server path and att_id, then substitute them; the content itself does not need parsing
-        if '.html' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
-                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
-                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-            if 'http' in fu_jian_href:
-                pass
-            else:
-                # Count how many '../' segments there are
-                if '../' in fu_jian_href:
-                    count = fu_jian_href.count("../")
-                    if count == 1:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/zcfg/'
-                    if count == 2:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
-                    if count == 3:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
-                else:
-                    if './' in fu_jian_href:
-                        hostUrl = href.split('/t')[0]
-                # Replace with the absolute path
-                fin_fj_href = replaceUrl(hostUrl, fu_jian_href)
-                # Substitute the new path into the tag
-                fu_jian_tag['href'] = fin_fj_href
-    return fu_jian_soup

 def save_data(result_dict):
     try:
         aa = result_dict['信息来源']
...
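The deleted attachjob resolved relative attachment links by counting '../' prefixes against hard-coded gzw.beijing.gov.cn hosts, which breaks whenever the directory depth changes. urljoin, used by the paserUrl helper added in the next hunk, performs the same resolution generically. A quick demonstration; the detail-page URL is invented:

    from urllib.parse import urljoin

    page = 'https://gzw.beijing.gov.cn/xxfb/zcfg/202308/t_detail.html'  # hypothetical detail page
    print(urljoin(page, '../fujian/a.pdf'))
    # -> https://gzw.beijing.gov.cn/xxfb/zcfg/fujian/a.pdf
    print(urljoin(page, './fujian/b.doc'))
    # -> https://gzw.beijing.gov.cn/xxfb/zcfg/202308/fujian/b.doc
    print(urljoin(page, 'https://example.com/c.zip'))
    # -> https://example.com/c.zip  (absolute URLs pass through unchanged)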
@@ -487,6 +452,21 @@ def get_content3():
             end_time = time.time()
             print(f'共抓取{num}条数据,耗时{end_time - start_time}')

+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+# Convert relative addresses in the HTML into absolute addresses
+def paserUrl(html, listurl):
+    # soup = BeautifulSoup(html, 'html.parser')
+    # Collect all <a> tags and <img> tags
+    links = html.find_all(['a', 'img'])
+    # Walk the tags, converting relative addresses into absolute ones
+    for link in links:
+        if 'href' in link.attrs:
+            link['href'] = urljoin(listurl, link['href'])
+        elif 'src' in link.attrs:
+            link['src'] = urljoin(listurl, link['src'])
+    return html

 # 北京 (Beijing)
 def bei_jing():
...
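A short usage sketch for the new paserUrl helper, assuming it is in scope; the sample HTML is invented:

    from bs4 import BeautifulSoup

    html = '<a href="../fujian/a.pdf">附件</a><img src="./images/1.png">'
    soup = BeautifulSoup(html, 'lxml')
    soup = paserUrl(soup, 'https://gzw.beijing.gov.cn/xxfb/zcfg/t_detail.html')
    print(soup.prettify())  # every href/src now carries the full host prefix

Because paserUrl mutates and returns the soup it receives, the caller can keep working with the same object, which is how bei_jing() uses it in the next hunk.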
@@ -556,15 +536,14 @@ def bei_jing():
             cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
             soup_cont = BeautifulSoup(cont, 'lxml')
-            fu_jian_soup = soup_cont.find_all('a')
-            attachjob(fu_jian_soup, href[0])
-            print(fu_jian_soup)
-            print(soup_cont)
-            print(title)
+            # print(fu_jian_soup)
+            soup = paserUrl(soup_cont, href)
+            text = str(soup.prettify())
+            print(text)
             # print(title)
             num = 0
             fu_jian_soup = soup.find_all('a')
             for file in fu_jian_soup:
                 num += 1
                 file_href = file['href']
...
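For context, the pattern bei_jing() now follows: render the page in Selenium, pull the article container's innerHTML, and absolutize every link through paserUrl before the text is stored. A hedged sketch, assuming a Chrome driver and paserUrl in scope; the URL is invented:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from bs4 import BeautifulSoup

    bro = webdriver.Chrome()  # the script's real driver setup is outside this hunk
    bro.get('https://gzw.beijing.gov.cn/xxfb/zcfg/t_detail.html')  # hypothetical URL
    cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
    soup = paserUrl(BeautifulSoup(cont, 'lxml'), bro.current_url)
    print(soup.prettify())
    bro.quit()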
comData/weixin_solo/get_tokenCookies.py
...
@@ -58,7 +58,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # Adjustable
-    time.sleep(30)
+    time.sleep(70)
     s = requests.session()
     # Grab the token and cookies
...
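The only change here widens the pause from 30 to 70 seconds, giving the operator more time to scan the WeChat login QR code before the token and cookies are harvested. A sketch of the surrounding pattern; the cookie hand-off into requests is assumed rather than shown in this hunk:

    import time
    import requests
    from selenium import webdriver

    browser = webdriver.Chrome()
    browser.get("https://mp.weixin.qq.com/")
    time.sleep(70)  # operator scans the login QR code during this window

    s = requests.session()
    for c in browser.get_cookies():           # copy the logged-in browser cookies
        s.cookies.set(c['name'], c['value'])  # into the requests session

An explicit WebDriverWait on a post-login element would be a more robust alternative to the fixed sleep.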