Commit c887e9d2 by XveLingKun

0717

Parent 57e944a7
One file's source diff could not be displayed because it is too large.
@@ -52,8 +52,10 @@ if __name__ == "__main__":
     opt.add_experimental_option("excludeSwitches", ["enable-automation"])
     opt.add_experimental_option('excludeSwitches', ['enable-logging'])
     opt.add_experimental_option('useAutomationExtension', False)
-    opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
-    chromedriver = r'F:\spider\cmd100\chromedriver.exe'
+    # opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
+    # chromedriver = r'F:\spider\cmd100\chromedriver.exe'
+    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+    chromedriver = r'D:\cmd100\chromedriver.exe'
     browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
...
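Note on the hunk above: both the old and new Chrome paths are passed through chrome_options= and executable_path=, which are deprecated and removed in recent Selenium 4 releases. A minimal sketch of the same setup on the Selenium 4 API, assuming the project can upgrade (paths and options copied from the diff above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

opt = Options()
opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'

# Selenium 4 wraps the driver path in a Service object instead of executable_path
service = Service(executable_path=r'D:\cmd100\chromedriver.exe')
browser = webdriver.Chrome(service=service, options=opt)
browser.get("https://mp.weixin.qq.com/")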
@@ -3,7 +3,9 @@ import time
 import json
 import pymongo

-url = "https://web.archive.org/web/20230702131549/https://www.forbes.com/lists/global2000/"
+# url = "https://web.archive.org/web/20230702131549/https://www.forbes.com/lists/global2000/"
+url = "https://web.archive.org/web/20220929184024/https://www.forbes.com/lists/global2000/"
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
     '福布斯企业人数']
 headers = {
@@ -25,7 +27,7 @@ headers = {
 import requests
 from bs4 import BeautifulSoup
+requests.adapters.DEFAULT_RETRIES = 5
 proxies = {
     'https': 'http://127.0.0.1:1080',
     'http': 'http://127.0.0.1:1080',
@@ -46,7 +48,7 @@ with open('./a.txt', 'r', encoding='utf-8') as f:
     dataJson = f.read()
 dataJson = json.loads(dataJson)
 tableDates = dataJson['tableData']
-for tableDate in tableDates[894:]:
+for tableDate in tableDates:
     uri = tableDate['uri']
     rank = tableDate['rank']
@@ -79,4 +81,5 @@ for tableDate in tableDates[894:]:
     db_storage.insert_one(dic)
     print(f'{rank}==={organizationName}===已入库')
     req.close()
-    time.sleep(1)
+    # time.sleep(1)
+    break
\ No newline at end of file
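The hunks above switch the list URL to the 2022 snapshot, drop the [894:] resume slice, comment out the per-request sleep, and lean on requests.adapters.DEFAULT_RETRIES = 5 for resilience. That module-level constant only governs connection-level retries; a sketch of a more explicit alternative (not part of the commit, names are illustrative) mounts an HTTPAdapter with a Retry policy on a Session:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 5 times with exponential backoff, also on common transient status codes.
retry_policy = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry_policy))
session.mount('http://', HTTPAdapter(max_retries=retry_policy))

# session.get(...) can then replace the bare requests.get(...) calls in the script.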
import json

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from retry import retry

db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '2022年福布斯企业人数']
url = 'https://web.archive.org/web/20220929184024/https://www.forbes.com/lists/global2000/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Cookie': 'lux_uid=166447682647510727; donation-identifier=aab33e1c4e293a8fcd5490465688bb01; bafp=79fcddb0-4e71-11ee-8a81-b762f64bf85c',
    'Priority': 'u=0, i',
    'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
}
proxies = {
    'https': 'http://127.0.0.1:1080',
    'http': 'http://127.0.0.1:1080',
}


@retry(tries=5, delay=2)
def detail(href):
    # Fetch a company detail page and return all of its <script> tags
    try:
        req = requests.get(headers=headers, url=href, verify=False, proxies=proxies)
        soup_ = BeautifulSoup(req.text, 'lxml')
        scripts = soup_.find_all('script')
        req.close()
        return scripts
    except:
        raise


@retry(tries=3, delay=2)
def spider():
    # Walk the Global 2000 list page and fetch each company's employee count
    response = requests.get(url=url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'html.parser')
    # print(soup)
    tables = soup.find_all('div', class_="table-row-group")
    print(len(tables))
    for idx, table in enumerate(tables):
        print(f'正在遍历第{idx}个table')
        a_list = table.find_all('a', class_="table-row")
        for a in a_list:
            rank = a.find('div', class_="rank").text.replace('.', '')
            print(f'排名: {rank}')
            organizationName = a.find('div', class_="organizationName").text
            href = a.get('href')
            try:
                scripts = detail(href)
            except:
                print(f'error--:{idx},{rank},{organizationName}')
                item = str(idx) + ',' + rank + ',' + organizationName
                with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                    f.write(item)
                continue
            # print(scripts)
            for script in scripts:
                if 'numberOfEmployees' in script.text:
                    break
            else:
                continue
                # print(f'{rank}--{uri}---not found')
            try:
                employeesJson = script.text
                # print(employeesJson)
                employeesJson = json.loads(employeesJson)
                numberOfEmployees = employeesJson['numberOfEmployees'].replace(',', '')
            except:
                numberOfEmployees = '--'
            dic = {
                '排名': rank,
                '企业名称': organizationName,
                '员工人数': numberOfEmployees,
            }
            # print(dic)
            db_storage.insert_one(dic)
            print(f'{rank}==={organizationName}===已入库')


def spider2():
    # Read the Excel sheet of companies still missing employee counts
    df = pd.read_excel('./2022年福布斯榜单.xlsx', sheet_name='待补充')
    # Get the rows as a list
    data = df.values.tolist()
    for idx, row in enumerate(data):
        # Rank and company name columns
        rank = row[1]
        organizationName = row[2]
        # Lower-case the name and turn spaces into hyphens to build the URL slug
        organizationName = organizationName.lower().replace(' ', '-')
        href = f'https://web.archive.org/web/20220929184024/https://www.forbes.com/companies/{organizationName}/?list=global2000'
        # Fetch the detail page
        try:
            scripts = detail(href)
        except:
            print(f'error--:{idx},{rank},{organizationName}')
            item = str(idx) + ',' + rank + ',' + organizationName
            with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                f.write(item)
            continue
        # print(scripts)
        for script in scripts:
            if 'numberOfEmployees' in script.text:
                break
        else:
            continue
            # print(f'{rank}--{uri}---not found')
        try:
            employeesJson = script.text
            # print(employeesJson)
            employeesJson = json.loads(employeesJson)
            numberOfEmployees = employeesJson['numberOfEmployees'].replace(',', '')
        except:
            numberOfEmployees = '--'
        dic = {
            '排名': rank,
            '企业名称': organizationName,
            '员工人数': numberOfEmployees,
        }
        # print(dic)
        db_storage.insert_one(dic)
        print(f'{rank}==={organizationName}===已入库')


if __name__ == '__main__':
    # spider()
    spider2()
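In the new file above, detail() scans every <script> tag for one whose body parses as JSON with a top-level numberOfEmployees field, presumably a schema.org-style data block on the company page. A minimal standalone sketch of that extraction, using a made-up payload rather than a real Forbes response:

import json

# Hypothetical script-tag body; the real page structure may differ.
script_text = '{"@type": "Organization", "name": "Example Corp", "numberOfEmployees": "164,000"}'

data = json.loads(script_text)
number_of_employees = data['numberOfEmployees'].replace(',', '')
print(number_of_employees)  # -> 164000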
import pandas as pd
import pymongo

# 7649
data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['全球企业资讯']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# Export the records whose 标签 (label) field is empty
datas = db_stroage.find({"标签": ""})
link = []
for data in datas:
    del data['_id']
    del data['id']
    if data['标题'] not in link:
        data_list.append(data)
        link.append(data['标题'])
        # print(data)
print(len(data_list))
df = pd.DataFrame(data_list)
df.to_excel('./不保留企业资讯.xlsx', index=False)
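The de-duplication above keeps the first record per title via a manually maintained link list. An equivalent sketch using pandas, assuming the cursor results are first materialized into a list (column names copied from the script):

import pandas as pd

records = list(datas)  # 'datas' is the find() cursor from the script above
df = pd.DataFrame(records).drop(columns=['_id', 'id'], errors='ignore')
# Keep the first record per 标题, matching the manual de-duplication above
df = df.drop_duplicates(subset=['标题'], keep='first')
df.to_excel('./不保留企业资讯.xlsx', index=False)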
import json
import re
import threading
import time
import uuid

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '智库-不保留222']
lock = threading.Lock()


class EsMethod(object):
    def __init__(self):
        # Create the Elasticsearch client and supply its credentials
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum):
        # Query one 200-hit page for the subject, sorted by createDate descending
        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "subjectId": "1537739653432397825"
                            }
                        },
                        {
                            "match": {
                                "deleteFlag": "1"
                            }
                        },
                        {
                            "range": {
                                "createDate": {
                                    "gte": "2023-12-31T00:00:00",
                                    "lte": "2024-07-02T12:00:00"
                                }
                            }
                        }
                    ]
                }
            },
            "sort": [
                {
                    "createDate": {
                        "order": "desc"
                    }
                }
            ],
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        result = self.es.search(index=index_name, doc_type='_doc', body=body)
        # log.info(result)
        return result


def clean_html_tag(content):
    # todo: assumes paragraphs are delimited by </p> in production content
    ori_text = re.sub(r"(<\/p\s*>)", "\t", content)
    # Strip image tags
    ori_text = re.sub(r"<img.*?/>", "", ori_text)
    tag_content_list = ori_text.split("\t") if "<p" in ori_text else ori_text
    temp_content_list = []
    if type(tag_content_list) is list:
        for text in tag_content_list:
            bs = BeautifulSoup(text, 'lxml')
            ori_match_content = bs.text.strip()
            temp_content_list.append(ori_match_content)
        match_content = "\n".join(temp_content_list)
    else:
        bs1 = BeautifulSoup(tag_content_list, 'lxml')
        match_content = bs1.text.strip()
    # if "参考文献" not in tag_content_list:
    #     match_content = temp_content
    # else:
    #     match_content = temp_content.split("参考文献")[0]
    return match_content


def preprocess(text: str):
    text = text.strip().strip('\n').strip()
    text = re.sub(' +', '', text)
    text = re.sub('\n+', '\n', text)
    return text


def main(page, p, esMethod):
    # Export one 200-hit page: clean the HTML fields and insert each record into MongoDB
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']
    # if total == 0:
    #     log.info('++++已没有数据+++++')
    #     return
    try:
        msglist = result['hits']['hits']
    except:
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        try:
            content = mms['_source']['content']
        except:
            continue
        try:
            clean_content = clean_html_tag(content)
            pre_content = preprocess(clean_content)
        except:
            pre_content = content
        try:
            summary = mms['_source']['summary']
        except:
            summary = ''
        try:
            clean_summary = clean_html_tag(summary)
            pre_summary = preprocess(clean_summary)
        except:
            pre_summary = summary
        try:
            contentRaw = mms['_source']['contentRaw']
        except:
            contentRaw = ''
        try:
            clean_contentRaw = clean_html_tag(contentRaw)
            pre_contentRaw = preprocess(clean_contentRaw)
        except:
            pre_contentRaw = contentRaw
        try:
            titleRaw = mms['_source']['titleRaw']
        except:
            titleRaw = ''
        try:
            summaryRaw = mms['_source']['summaryRaw']
        except:
            summaryRaw = ''
        try:
            clean_summaryRaw = clean_html_tag(summaryRaw)
            pre_summaryRaw = preprocess(clean_summaryRaw)
        except:
            pre_summaryRaw = summaryRaw
        contentWithTag = mms['_source']['contentWithTag']
        log.info(f'{id}--{title}---')
        # labels = mms['_source']['labels']
        # tags = []
        # for label in labels:
        #     label_name = label['labelMark']
        #     if label_name == "dynamic_tags":
        #         relationName = label['relationName']
        #         tags.append(relationName)
        #     else:
        #         continue
        # info_tags = ','.join(tags)
        # Store the record in MongoDB
        dic = {
            "id": id,
            "标题": title,
            "摘要": pre_summary,
            "内容": pre_content,
            "标题译文": titleRaw,
            "摘要译文": pre_summaryRaw,
            "内容译文": pre_contentRaw,
            "正文html": contentWithTag,
            "标签": '',
            "状态": "通过",
        }
        db_storage.insert_one(dic)


def run_threads(num_threads, esMethod, j):
    # Start num_threads workers, each exporting a different 200-hit page, then wait for them
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(9):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
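Both export scripts page through results with from/size, which Elasticsearch caps at index.max_result_window (10,000 hits by default); the loops above stay just under that limit, but a deeper export would need the scroll API or search_after. A rough sketch of a scroll-based variant that reuses the same query body (the function name and the 5-minute scroll window are illustrative, not part of the commit):

def export_all(es, index_name, body):
    # Walk every match 200 at a time with a scroll cursor instead of from/size paging.
    body = dict(body, size=200)
    body.pop("from", None)
    result = es.search(index=index_name, body=body, scroll='5m')
    scroll_id = result['_scroll_id']
    hits = result['hits']['hits']
    while hits:
        for hit in hits:
            yield hit
        result = es.scroll(scroll_id=scroll_id, scroll='5m')
        scroll_id = result['_scroll_id']
        hits = result['hits']['hits']
    es.clear_scroll(scroll_id=scroll_id)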
import json
import re
import threading
import time
import uuid

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '全球企业资讯0710']
lock = threading.Lock()


class EsMethod(object):
    def __init__(self):
        # Create the Elasticsearch client and supply its credentials
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum):
        # Query one 200-hit page for the subject, sorted by createDate descending
        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "subjectId": "1734030182269853697"
                            }
                        },
                        {
                            "range": {
                                "createDate": {
                                    "gte": "2024-07-01T00:00:00",
                                    "lte": "2024-07-11T00:00:00"
                                }
                            }
                        }
                    ]
                }
            },
            "sort": [
                {
                    "createDate": {
                        "order": "desc"
                    }
                }
            ],
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        result = self.es.search(index=index_name, doc_type='_doc', body=body)
        # log.info(result)
        return result


def clean_html_tag(content):
    # todo: assumes paragraphs are delimited by </p> in production content
    ori_text = re.sub(r"(<\/p\s*>)", "\t", content)
    # Strip image tags
    ori_text = re.sub(r"<img.*?/>", "", ori_text)
    tag_content_list = ori_text.split("\t") if "<p" in ori_text else ori_text
    temp_content_list = []
    if type(tag_content_list) is list:
        for text in tag_content_list:
            bs = BeautifulSoup(text, 'lxml')
            ori_match_content = bs.text.strip()
            temp_content_list.append(ori_match_content)
        match_content = "\n".join(temp_content_list)
    else:
        bs1 = BeautifulSoup(tag_content_list, 'lxml')
        match_content = bs1.text.strip()
    # if "参考文献" not in tag_content_list:
    #     match_content = temp_content
    # else:
    #     match_content = temp_content.split("参考文献")[0]
    return match_content


def preprocess(text: str):
    text = text.strip().strip('\n').strip()
    text = re.sub(' +', '', text)
    text = re.sub('\n+', '\n', text)
    return text


def main(page, p, esMethod):
    # Export one 200-hit page: clean the HTML fields, collect the dynamic tags, insert into MongoDB
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']
    # if total == 0:
    #     log.info('++++已没有数据+++++')
    #     return
    try:
        msglist = result['hits']['hits']
    except:
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        try:
            content = mms['_source']['content']
        except:
            continue
        try:
            contentWithTag = mms['_source']['contentWithTag']
        except:
            continue
        try:
            clean_content = clean_html_tag(content)
            pre_content = preprocess(clean_content)
        except:
            pre_content = content
        try:
            summary = mms['_source']['summary']
        except:
            summary = ''
        try:
            clean_summary = clean_html_tag(summary)
            pre_summary = preprocess(clean_summary)
        except:
            pre_summary = summary
        try:
            contentRaw = mms['_source']['contentRaw']
        except:
            contentRaw = ''
        try:
            clean_contentRaw = clean_html_tag(contentRaw)
            pre_contentRaw = preprocess(clean_contentRaw)
        except:
            pre_contentRaw = contentRaw
        try:
            titleRaw = mms['_source']['titleRaw']
        except:
            titleRaw = ''
        try:
            summaryRaw = mms['_source']['summaryRaw']
        except:
            summaryRaw = ''
        try:
            clean_summaryRaw = clean_html_tag(summaryRaw)
            pre_summaryRaw = preprocess(clean_summaryRaw)
        except:
            pre_summaryRaw = summaryRaw
        log.info(f'{id}--{title}---')
        labels = mms['_source']['labels']
        tags = []
        for label in labels:
            label_name = label['labelMark']
            if label_name == "dynamic_tags":
                relationName = label['relationName']
                tags.append(relationName)
            else:
                continue
        info_tags = ','.join(tags)
        # Store the record in MongoDB
        dic = {
            "id": id,
            "标题": title,
            "摘要": pre_summary,
            "内容": pre_content,
            "带标签内容": contentWithTag,
            "标题译文": titleRaw,
            "摘要译文": pre_summaryRaw,
            "内容译文": pre_contentRaw,
            "标签": info_tags,
        }
        db_storage.insert_one(dic)


def run_threads(num_threads, esMethod, j):
    # Start num_threads workers, each exporting a different 200-hit page, then wait for them
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(2):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
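Both ES export scripts create a lock = threading.Lock() that is never used. MongoClient is thread-safe, so the plain insert_one calls from the worker threads are fine as written; the lock only becomes necessary if shared mutable state is added later. A small sketch under that assumption (the errors list is hypothetical, not part of the commit):

errors = []  # hypothetical shared list collected across worker threads

def record_error(item):
    # Guard shared mutable state when several threads append to it concurrently.
    with lock:
        errors.append(item)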