Commit 23d4dd76 Author: 薛凌堃

24/01/02

Parent be4f79be
...@@ -464,7 +464,8 @@ def zhengquanqihuo(): ...@@ -464,7 +464,8 @@ def zhengquanqihuo():
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse(): def sse():
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761' # url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_=1703469889542'
headers = { headers = {
'Accept': '*/*', 'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate', 'Accept-Encoding': 'gzip, deflate',
...@@ -485,9 +486,13 @@ def sse(): ...@@ -485,9 +486,13 @@ def sse():
# os.makedirs(path) # os.makedirs(path)
for page in range(0, int(total_page)): for page in range(0, int(total_page)):
t = int(time.time()) t = int(time.time())
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=24278800487459370386559742313666&_={t}' url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C12632&trackId=00752019013296307464953343505659&_={t}'
data = policy.getrequest_json(headers, url_page) data = policy.getrequest_json(headers, url_page)
newslist = data['data']['knowledgeList'] newslist = data['data']['knowledgeList']
# if newslist:
# pass
# else:
# continue
# print(newslist) # print(newslist)
for news in newslist: for news in newslist:
num += 1 num += 1
...@@ -521,8 +526,8 @@ def sse(): ...@@ -521,8 +526,8 @@ def sse():
content = '' content = ''
response = requests.get(newsUrl, timeout=20) response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc: with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages(): for page_ in doc.pages():
content += page.get_text() content += page_.get_text()
file_href = newsUrl file_href = newsUrl
file_name = title file_name = title
...@@ -628,7 +633,7 @@ def sse(): ...@@ -628,7 +633,7 @@ def sse():
for att_id in id_list: for att_id in id_list:
baseCore.deliteATT(att_id) baseCore.deliteATT(att_id)
except Exception as e: except Exception as e:
log.info(f"error!!!{newsUrl}") log.info(f"error!!!{newsUrl}===={title}")
log.info(e) log.info(e)
log.info(f'====第{page}页====处理结束,================') log.info(f'====第{page}页====处理结束,================')
...@@ -972,14 +977,14 @@ def guizhou(): ...@@ -972,14 +977,14 @@ def guizhou():
if __name__=="__main__": if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx' # file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path) # wb = policy.createfile(file_path)
reform() # reform()
# shenzhen() # # shenzhen()
zhengquanqihuo() # zhengquanqihuo()
try: try:
sse() sse()
except: except:
pass pass
hebei() # hebei()
guizhou() # guizhou()
# zhengquanqihuo() # zhengquanqihuo()
\ No newline at end of file
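The rename of the inner loop variable from page to page_ in sse() matters because the outer pagination counter is also named page and is still used in the closing log line (第{page}页 处理结束); with the old name, that line logged the last fitz page object instead of the page number. A minimal illustration of the shadowing, with a plain list standing in for doc.pages() (everything below is illustrative, not code from the repo):

# before the rename: the inner loop reuses the name 'page'
for page in range(0, 3):                        # pagination counter
    for page in ["pdf_page_1", "pdf_page_2"]:   # shadows the counter
        pass
    print(f"第{page}页 处理结束")                 # prints 'pdf_page_2', not 0/1/2

# after the rename: the counter survives the inner loop
for page in range(0, 3):
    for page_ in ["pdf_page_1", "pdf_page_2"]:
        pass
    print(f"第{page}页 处理结束")                 # prints 0, 1, 2 as intended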
...@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen ...@@ -9,7 +9,7 @@ import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing from REITs_policyData.policy_beijing import beijing
if __name__ == "__mian__": if __name__ == "__main__":
beijing() beijing()
reits.sse() reits.sse()
reits.reform() reits.reform()
......
...@@ -674,7 +674,7 @@ if __name__ == "__main__": ...@@ -674,7 +674,7 @@ if __name__ == "__main__":
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
# MengZhi() # MengZhi()
# NQEnterprise() NQEnterprise()
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
...@@ -683,6 +683,6 @@ if __name__ == "__main__": ...@@ -683,6 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task() # AnnualEnterprise_task()
# FinanceFromEast() # FinanceFromEast()
# ipo_code() # ipo_code()
JingyingfenxiFromEase() # JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===') log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
...@@ -43,7 +43,7 @@ class EsMethod(object): ...@@ -43,7 +43,7 @@ class EsMethod(object):
"must": [ "must": [
{ {
"match": { "match": {
"type": "1" "type": "0"
} }
} }
] ]
...@@ -115,7 +115,7 @@ def main(page, p, esMethod): ...@@ -115,7 +115,7 @@ def main(page, p, esMethod):
attid = mms['_source']['attachmentIds'][0] attid = mms['_source']['attachmentIds'][0]
log.info(f'{id}-{attid}--{title}--{sourceAddress}---') log.info(f'{id}-{attid}--{title}--{sourceAddress}---')
selects = secrchATT('1', attid) selects = secrchATT('4', attid)
if selects: if selects:
pass pass
else: else:
......
...@@ -53,12 +53,12 @@ class EsMethod(object): ...@@ -53,12 +53,12 @@ class EsMethod(object):
# 'hits.hits._source.createDate', # 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate', # 'hits.hits._source.publishDate',
] # 字段2 ] # 字段2
result = self.es.search(index=index_name resultb = self.es.search(index=index_name
, doc_type='_doc' , doc_type='_doc'
, filter_path=filter_path , filter_path=filter_path
, body=body) , body=body)
# log.info(result) # log.info(result)
return result return resultb
def updateaunn(self, index_name, id, content, contentWithTag): def updateaunn(self, index_name, id, content, contentWithTag):
body = { body = {
...@@ -67,24 +67,28 @@ class EsMethod(object): ...@@ -67,24 +67,28 @@ class EsMethod(object):
'contentWithTag': contentWithTag 'contentWithTag': contentWithTag
} }
} }
result = self.es.update(index=index_name resulta = self.es.update(index=index_name
,id=id ,id=id
,body=body) ,body=body)
log.info('更新结果:%s' % result) log.info('更新结果:%s' % resulta)
def paserUrl(html,listurl): def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签 # 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img']) links = html.find_all(['a', 'img'])
print(len(links))
# 遍历标签,将相对地址转换为绝对地址 # 遍历标签,将相对地址转换为绝对地址
for link in links: for link in links:
print(link)
if 'href' in link.attrs: if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href']) # link['href'] = urljoin(listurl, link['href'])
pass
elif 'src' in link.attrs: elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src']) pass
# link['src'] = urljoin(listurl, link['src'])
return html return html
def get_news(news_url,ip_dic): def get_news(news_url,sourceAddress,id):
header = { header = {
'Host': 'www.sec.gov', 'Host': 'www.sec.gov',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic): ...@@ -102,30 +106,44 @@ def get_news(news_url,ip_dic):
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D' 'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
} }
response = requests.get(url=news_url,headers=header,verify=False,timeout=30) response = requests.get(url=news_url,headers=header,verify=False)
# aa = response.text
# print(response.text)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30) # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if response.status_code == 200: if response.status_code == 200:
# 请求成功,处理响应数据 # 请求成功,处理响应数据
# print(response.text) # print(response.text)
result = BeautifulSoup(response.content,'html.parser') # result_ = BeautifulSoup(response.content,'html.parser')
result_ = BeautifulSoup(response.text, 'lxml')
# print(result) # print(result)
pass pass
else: else:
# 请求失败,输出错误信息 # 请求失败,输出错误信息
log.info('请求失败:', response.status_code, response.text) log.info('请求失败:', response.status_code, response.text)
result = '' result_ = ''
return result if result_:
pass
# 相对路径转化为绝对路径
# soup = paserUrl(result_, sourceAddress)
time.sleep(2)
content = result_.text.strip()
# del(result_)
# content = result_
# print(content)
time.sleep(2)
esMethod.updateaunn(esMethod.index_name, str(id), content, str(result_))
def main(esMethod): def main(esMethod):
redis_conn = redis.Redis(connection_pool=pool) redis_conn = redis.Redis(connection_pool=pool)
id_ = redis_conn.lpop('NianbaoUS:id') id_ = redis_conn.lpop('NianbaoUS:id')
id = id_.decode()
# id = "23101317164" # id = "23101317164"
if id: if id_:
pass pass
else: else:
log.info('已无数据') log.info('已无数据')
return return False
id = id_.decode()
result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id) result_ = esMethod.queryatt(index_name=esMethod.index_name,id=id)
result = result_['hits']['hits'][0] result = result_['hits']['hits'][0]
num = 0 num = 0
...@@ -135,17 +153,8 @@ def main(esMethod): ...@@ -135,17 +153,8 @@ def main(esMethod):
log.info(f'====={title}=={social_code}===正在更新===') log.info(f'====={title}=={social_code}===正在更新===')
sourceAddress = result['_source']['sourceAddress'] sourceAddress = result['_source']['sourceAddress']
ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'} ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
soup = get_news(sourceAddress,ip_dic) get_news(sourceAddress,sourceAddress,id)
if soup: return True
pass
else:
return
# 相对路径转化为绝对路径
soup = paserUrl(soup, sourceAddress)
content = soup.text.strip()
esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
return
def run_threads(num_threads,esMethod): def run_threads(num_threads,esMethod):
...@@ -164,6 +173,9 @@ if __name__ == '__main__': ...@@ -164,6 +173,9 @@ if __name__ == '__main__':
while True: while True:
esMethod = EsMethod() esMethod = EsMethod()
start = time.time() start = time.time()
num_threads = 5 # num_threads = 5
run_threads(num_threads,esMethod) # run_threads(num_threads,esMethod)
log.info(f'5线程 总耗时{time.time()-start}秒') # log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file result = main(esMethod)
if not result:
break
\ No newline at end of file
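The reshuffle in main() — testing id_ before calling .decode() — guards the empty-queue case: Redis lpop returns None when the list 'NianbaoUS:id' is exhausted, and None.decode() would raise AttributeError before the '已无数据' branch was ever reached. A minimal sketch of the pattern, assuming a reachable Redis instance (host and port below are placeholders):

import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)  # placeholder connection

raw = r.lpop("NianbaoUS:id")   # bytes, or None when the list is empty
if raw is None:
    print("已无数据")           # nothing left; the caller returns False and the loop ends
else:
    doc_id = raw.decode()      # safe to decode only after the None check
    print(f"processing {doc_id}")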
# 证监会沪市、gong深市 公司债券和企业债券采集 # 证监会沪市、深市 公司债券和企业债券采集
"""
证监会企业名单
"""
import time import time
import random import random
import requests import requests
...@@ -25,7 +22,7 @@ cursor = baseCore.cursor ...@@ -25,7 +22,7 @@ cursor = baseCore.cursor
cnx_ = baseCore.cnx_ cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_ cursor_ = baseCore.cursor_
taskType = '企业名单/证监会' taskType = '企业债券/证监会'
def createDriver(): def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe' chrome_driver = r'D:\cmd100\chromedriver.exe'
...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信 ...@@ -136,7 +133,8 @@ def SpiderByZJH(url, start_time): # dic_info 数据库中获取到的基本信
page = soup.find('div', class_='pages').find_all('li')[-1] page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text total = page.find('b').text
for i in range(1,int(total)+1): # for i in range(1,int(total)+1):
for i in range(224, 225):
log.info(f'==========正在采集第{i}页=========') log.info(f'==========正在采集第{i}页=========')
if i == 1: if i == 1:
href = url href = url
...@@ -241,7 +239,7 @@ if __name__ == '__main__': ...@@ -241,7 +239,7 @@ if __name__ == '__main__':
# url_parms = ['201010', '201014'] # url_parms = ['201010', '201014']
# url_parms = ['201011', '201013'] # url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014'] url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014'] # url_parms = ['201411']
for url_parm in url_parms: for url_parm in url_parms:
url = getUrl(url_parm) url = getUrl(url_parm)
......
import yfinance as yf
# 获取股票数据
stock = yf.Ticker("MET")
# 获取资产负债表数据
balance_sheet = stock.balance_sheet
# 获取报告日期(yfinance 的报表 DataFrame 以日期为列、科目为行)
report_dates = balance_sheet.columns
print(report_dates)
# 获取现金流量表数据
cashflow_statement = stock.cashflow
# 获取利润表数据
income_statement = stock.financials
print(balance_sheet)
print(cashflow_statement)
print(income_statement)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
# import yfinance as yf
#
# # 获取股票数据
# stock = yf.Ticker("AAPL")
#
# # 获取历史价格数据
# historical_prices = stock.history(period="max")
#
# # 获取市值数据
# market_cap = stock.info["marketCap"]
#
# print(historical_prices)
# print(market_cap)
...@@ -57,8 +57,8 @@ def page_list(): ...@@ -57,8 +57,8 @@ def page_list():
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -69,11 +69,11 @@ def page_list(): ...@@ -69,11 +69,11 @@ def page_list():
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList' url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,453): for i in range(1,2):
log.info(f'采集第{i}页数据') log.info(f'采集第{i}页数据')
k=i k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}' da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
...@@ -110,8 +110,8 @@ def detailpaser(dmsg): ...@@ -110,8 +110,8 @@ def detailpaser(dmsg):
'Content-Length': '25', 'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks', 'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap', 'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x63090819)XWEB/8519',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5', 'x-tif-sid': 'ee270e93c3636dc3f281da8e0603db6a63',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'xweb_xhr': '1', 'xweb_xhr': '1',
'dgd-pre-release': '0', 'dgd-pre-release': '0',
...@@ -122,7 +122,7 @@ def detailpaser(dmsg): ...@@ -122,7 +122,7 @@ def detailpaser(dmsg):
'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html', 'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/750/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br' 'Accept-Encoding': 'gzip, deflate, br'
} }
try: try:
......
import json
import time
import uuid
import pymysql
import redis
import requests
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = baseCore.cnx_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
pathType = 'CrowDingZhi/'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
'Host': 'query.sse.com.cn',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
create_time, come, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn')
# log.info(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where type_id=15 and full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# log.info("更新完成:{}".format(pdf_id))
return pdf_id
def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
# response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
response = requests.get(pdf_url)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
for i in range(0, 3):
try:
name = str(getuuid()) + '.' + category
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
if category == 'pdf':
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
else:
page_size = 0
retData['content'] = ''
break
except Exception as e:
time.sleep(3)
continue
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
log.info(f'error---{e}')
return retData
return retData
if __name__ == "__main__":
num = 0
t = int(time.time()*1000)
url_ = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req_ = requests.get(url=url_, headers=headers)
data_json = req_.json()
print(data_json)
pageCount = data_json['pageHelp']['pageCount']
for i in range(1,int(pageCount + 1)):
url = f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={i}&pageHelp.beginPage={i}&pageHelp.cacheSize=1&pageHelp.endPage={i}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={t}'
req = requests.get(url=url, headers=headers)
data_list = req.json()['result']
for info in data_list:
publishDate = info['cmsOpDate'] # 处理日期
year = publishDate[:4]
com = '上海证券交易所'
docTitle = info['docTitle'] # 处理事由
docType = info['docType'] # 文档类型
docURL = "http://" + info['docURL'] # 链接 http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
flag = r.sismember('IN-20231227-0001', docURL)
if flag:
log.info('信息已采集入库过')
continue
# 上传至obs
retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
path = retData['path']
full_path = retData['full_path']
file_size = retData['file_size']
create_by = retData['create_by']
content = retData['content']
status = 1
num += 1
create_time = time_now
# 上传到附件表
att_id = tableUpdate(year, docTitle+'.'+docType, 15, '', '', path, full_path, docType, file_size, num, status, create_by, create_time, com, page_size)
if att_id:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sid = '1739914218978594817'
info_code = "IN-20231227-0001"
dic_news = {
'attachmentIds': str(att_id),
'content': content,
'contentWithTag': '',
'id': '',
'origin': com,
'publishDate': publishDate,
'sid': sid,
'sourceAddress': docURL,
'title': docTitle,
'source':'16',
'type': ''
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(e)
log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
r.sadd(info_code, docURL)
continue
# 中央全面深化改革委员会会议
import json
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'www.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
soup = BeautifulSoup(request.content, 'html.parser')
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
year = int(publishDate[:4])
if year < 2023:
continue
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
'origin': source,
'contentWithTag': str(contentwithTag),
'content': content,
'summary': summary,
'publishDate': publishDate,
'sid': sid,
'subjectId': '1681549361661489154',
'sourceAddress':newsUrl,
'checkStatus': 1,
'deleteFlag': 0,
'createDate': time_now,
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
\ No newline at end of file
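The 12371.cn collector above still carries a # todo: 链接判重 note before each article request; elsewhere in this commit the same need is met with a Redis set — sismember to skip URLs seen in earlier runs, sadd once the Kafka send succeeds. A small sketch of that pattern, again assuming a reachable Redis instance; the key name is a placeholder:

import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)  # placeholder connection
seen_key = "shenggaiwei:news"                       # placeholder dedup set

def already_collected(url: str) -> bool:
    # True if this article URL was handled in an earlier run
    return bool(r.sismember(seen_key, url))

def mark_collected(url: str) -> None:
    # record the URL only after the message is sent successfully
    r.sadd(seen_key, url)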
...@@ -27,29 +27,19 @@ class EsMethod(object): ...@@ -27,29 +27,19 @@ class EsMethod(object):
def __init__(self): def __init__(self):
# 创建Elasticsearch对象,并提供账号信息 # 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300) self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'policy' self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum): def queryatt(self,index_name,pnum):
body = { body = {
"query": { "query": {
"bool": { "bool": {
"must": [ "must": [
{ {
"nested" : { "term": {
"query" : { "sid.keyword": {
"bool" : { "value": "1662008524476948481"
"must" : [
{
"match_phrase" : {
"labels.relationId" : {
"query" : "1698"
}
}
} }
]
}
},
"path" : "labels"
} }
} }
] ]
...@@ -112,7 +102,7 @@ def main(page, p, esMethod): ...@@ -112,7 +102,7 @@ def main(page, p, esMethod):
unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents] unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
# 删除重复的文档 # 删除重复的文档
for doc_id in unique_document_ids: for doc_id in unique_document_ids:
esMethod.delete(index_name="policy", id=doc_id) esMethod.delete(index_name="researchreportdata", id=doc_id)
......
# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/国内上市'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
...@@ -56,7 +56,7 @@ if __name__=="__main__": ...@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
browser.get(url) browser.get(url)
# 可改动 # 可改动
time.sleep(10) time.sleep(20)
s = requests.session() s = requests.session()
#获取到token和cookies #获取到token和cookies
......
...@@ -239,6 +239,8 @@ if __name__=="__main__": ...@@ -239,6 +239,8 @@ if __name__=="__main__":
list_all_info = [] list_all_info = []
while True: while True:
#一次拿取一篇文章 #一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo() dict_json =getjsonInfo()
if dict_json: if dict_json:
if get_info(dict_json): if get_info(dict_json):
......
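The new # todo: 从redis拿数据 更新mysql状态 note marks the consumer side of a hand-off: pop an article id from Redis, process it, then flag its MySQL row as handled. One possible shape for that step — the queue key, table and column names are all assumptions, not taken from the repo:

import pymysql
import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)                       # placeholder
cnx = pymysql.connect(host="127.0.0.1", user="caiji", password="***",
                      db="caiji", charset="utf8mb4")                     # placeholder

raw = r.lpop("WeiXinGZH:article_id")          # hypothetical queue key
if raw is not None:
    article_id = raw.decode()
    with cnx.cursor() as cursor:
        cursor.execute(
            "update wx_link set state = %s where id = %s",               # hypothetical table/columns
            (1, article_id),
        )
    cnx.commit()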
...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page): ...@@ -113,7 +113,7 @@ def insertWxList(dic_url,json_search,page):
cnx_.commit() cnx_.commit()
except Exception as e: except Exception as e:
log.error(f"保存数据库失败:{e}") log.error(f"保存数据库失败:{e}")
# todo: 放入redis
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------") log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0: if listCount==0:
#列表为空认为结束 #列表为空认为结束
......
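The matching # todo: 放入redis note in insertWxList is the producer side of the same hand-off: after a new link row is inserted, push its id onto the queue that the consumer above pops from. Under the same placeholder names:

import pymysql
import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=5)                       # placeholder
cnx = pymysql.connect(host="127.0.0.1", user="caiji", password="***",
                      db="caiji", charset="utf8mb4")                     # placeholder

with cnx.cursor() as cursor:
    cursor.execute(
        "insert into wx_link (link, state) values (%s, %s)",             # hypothetical table/columns
        ("https://mp.weixin.qq.com/s/xxxx", 0),
    )
    cnx.commit()
    r.rpush("WeiXinGZH:article_id", str(cursor.lastrowid))               # hand the new id to the consumer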
from bs4 import BeautifulSoup
import requests,time,re
from base import BaseCore
# import pandas as pd
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
log = baseCore.getLogger()
taskType = '500强专利'
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
# df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
def name_handle(english_name_):
if 'INC.' in english_name_ or 'LTD.' in english_name_ or 'CO.' in english_name_ \
or 'CORP.' in english_name_ or 'GMBH' in english_name_ \
or ' AG' in english_name_ or 'SARL' in english_name_ or 'S.A.' in english_name_ \
or 'PTY' in english_name_ or 'LLC' in english_name_ or 'LLP' in english_name_ \
or ' AB' in english_name_ or ' NV' in english_name_ or 'N.V.' in english_name_ \
or 'A.S.' in english_name_ or ' SA' in english_name_ or ',Limited' in english_name_ \
or ' SE' in english_name_ or ' LPC' in english_name_ or 'S.P.A.' in english_name_:
english_name = english_name_.replace('INC.', '').replace('LTD.', '').replace('CO.', '').replace('CORP.', '') \
.replace('GMBH', '').replace(' AG', '').replace('SARL', '').replace('S.A.', '').replace('PTY', '') \
.replace('LLC', '').replace('LLP', '').replace(' AB', '').replace(' NV', '').replace(',', '') \
.replace('A.S.', '').replace(' SA', '').replace(',Limited', '').replace(' SE', '').replace(' PLC', '') \
.replace('N.V.', '').replace('S.P.A.', '').rstrip()
return english_name
else:
english_name = english_name_
return english_name
if __name__ == '__main__':
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gwSocial_code')
# social_code = '9111000071093123XX'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
english_name_ = data[5]
place = data[6]
if place == 1:
log.info(f'{com_name}--国内')
baseCore.rePutIntoR('ZhuanLi_500:zgSocial_code',social_code)
continue
if english_name_:
pass
else:
query = f"select * from sys_base_enterprise where social_credit_code ='{xydm}'"
cursor_.execute(query)
reslut = cursor_.fetchone()
english_name_ = reslut[32]
# todo:将该字段更新到144企业库
update_ = f"update EnterpriseInfo set EnglishName='{english_name_}' where SocialCode='{xydm}' "
cursor.execute(update_)
cnx.commit()
english_name_ = english_name_.upper()
english_name = name_handle(english_name_)
num_zhuanli = 0
# url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
# int(float(time.time()) * 1000))
#
# res1 = requests.get(url1, headers=headers)
# soup1 = BeautifulSoup(res1.content, 'html.parser')
#
# num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
#
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
# if zhuanli:
for year in range(2023, 1900, -1):
url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={english_name}&IN=&CPC=&IC=&rnd=' + str(
int(float(time.time()) * 1000))
# url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli == 0:
dic_info = {
'com_name': com_name,
'social_code': social_code,
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-----已经存在--{year}--无专利信息")
break
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}------新增----无专利信息")
break
dic_info = {
'com_name': com_name,
'social_code': social_code,
'year': year,
'num': zhuanli
}
# 插入数据库表中
selectSql = f"select count(1) from zhuanli_500 where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_500(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
except:
log.info("error!{}".format(social_code))
baseCore.rePutIntoR('ZhuanLi:gwSocial_code', social_code)
continue
\ No newline at end of file
import requests,re,time,os,datetime,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import redis
# headers = {
# "Cookie":"currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
# "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
# }
df_all = pd.read_excel('D:\\kkwork\\jupyter\\专利数量\\t1.xlsx')
# for i in range(2022,1890,-1):
# df_all[f'{i}'] = ''
# df_all['Espacenet专利检索'] = ''
headers = {
"Cookie": "currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2Fdata%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%25202021%25202020%25202019%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26rnd%3D1663641959596; PGS=10; _pk_id.93.72ee=ee83303e45a089a1.1663061058.; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=cn_EP; _pk_ses.93.72ee=1; LevelXLastSelectedDataSource=EPODOC; menuCurrentSearch=%2F%2Fworldwide.espacenet.com%2FsearchResults%3FAB%3D%26AP%3D%26CPC%3D%26DB%3DEPODOC%26IC%3D%26IN%3D%26PA%3Dapple%26PD%3D2022%26PN%3D%26PR%3D%26ST%3Dadvanced%26Submit%3D%E6%A3%80%E7%B4%A2%26TI%3D%26locale%3Dcn_EP; currentUrl=https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3Fsubmitted%3Dtrue%26locale%3Dcn_EP%26DB%3DEPODOC%26ST%3Dadvanced%26TI%3D%26AB%3D%26PN%3D%26AP%3D%26PR%3D%26PD%3D2022%26PA%3Dapple%26IN%3D%26CPC%3D%26IC%3D%26Submit%3D%25E6%25A3%2580%25E7%25B4%25A2; JSESSIONID=qnLt7d5QgBtpUGjMmHuD-kwJ.espacenet_levelx_prod_2",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}
for i in range(len(df_all['英文名称'])):
for num in range(0, 2):
try:
if '中国' not in df_all['企业所属国家'][i]:
com_name = df_all['英文名称'][i]
num_zhuanli = 0
url1 = f'https://worldwide.espacenet.com/data/searchResults?ST=singleline&locale=cn_EP&submitted=true&DB=&query={com_name}&rnd=' + str(
int(float(time.time()) * 1000))
res1 = requests.get(url1, headers=headers)
soup1 = BeautifulSoup(res1.content, 'html.parser')
num_text = soup1.find('p', {'class': 'numResultsFoundMsg'}).text
# try:
# zhuanli = re.findall("约(.*?)个", num_text)[0].replace(',', '')
# except:
# zhuanli = re.findall("多于(.*?)个", num_text)[0].replace(',', '')
zhuanli = '10000'
if zhuanli == '10000':
for year in range(2023, 1900, -1):
# url = f'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD={year}&PA={com_name}&IN=&CPC=&IC=&rnd=' + str(
# int(float(time.time()) * 1000))
url = 'https://worldwide.espacenet.com/data/searchResults?submitted=true&locale=cn_EP&DB=EPODOC&ST=advanced&TI=&AB=&PN=&AP=&PR=&PD=2022&PA=APPLE&IN=&CPC=&IC=&rnd=1703643229331'
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
num_text = soup.find('p', {'class': 'numResultsFoundMsg'}).text
try:
try:
zhuanli2 = int(re.findall("约(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("多于(.*?)个", num_text)[0].replace(',', ''))
except:
zhuanli2 = int(re.findall("找到(.*?)个", num_text)[0].replace(',', ''))
if zhuanli2 == 0:
break
df_all[f'{year}'][i] = zhuanli2
# num_zhuanli = num_zhuanli + zhuanli2
num_zhuanli = num_zhuanli + zhuanli2
print(year)
time.sleep(random.uniform(1.5, 2))
else:
num_zhuanli = int(zhuanli)
time.sleep(random.uniform(1.5, 2))
df_all['Espacenet专利检索'][i] = num_zhuanli
print(f"{com_name} : {num_zhuanli}")
break
except:
if num == 0:
print("重试")
time.sleep(60)
continue
else:
print("error!{}".format(df_all['英文名称'][i]))
\ No newline at end of file
...@@ -53,7 +53,27 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info): ...@@ -53,7 +53,27 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
time.sleep(2) time.sleep(2)
continue continue
# print(res_j) # print(res_j)
try:
list_all = res_j['data']['items'] list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all) # print(list_all)
if list_all: if list_all:
for one_zhuanli in list_all: for one_zhuanli in list_all:
...@@ -140,7 +160,7 @@ if __name__ == "__main__": ...@@ -140,7 +160,7 @@ if __name__ == "__main__":
list_all_info = [] list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code') social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# social_code = '9111010566840059XP' # social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
# time.sleep(20) # time.sleep(20)
......
...@@ -113,23 +113,23 @@ if __name__=='__main__': ...@@ -113,23 +113,23 @@ if __name__=='__main__':
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '') author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except: except:
continue continue
# if len(author)>4: if len(author)>4:
# continue continue
# if '(' in author or '本刊' in author or '国家' in author\ # if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\ # or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\ # or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author : # or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \ if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \ or '记者' in author or '新闻社' in author \
# or '”' in author\ or '”' in author\
# or '大学' in author or '洛桑江村' in author: or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
continue continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href'] new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href) is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
if is_member: if is_member:
continue continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '') new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
...@@ -165,7 +165,7 @@ if __name__=='__main__': ...@@ -165,7 +165,7 @@ if __name__=='__main__':
} }
log.info(dic_news) log.info(dic_news)
if sendKafka(dic_news): if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href) r.sadd('qiushileaderspeech_two::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}') log.info(f'采集成功----{dic_news["sourceAddress"]}')
...@@ -55,56 +55,56 @@ from obs import ObsClient ...@@ -55,56 +55,56 @@ from obs import ObsClient
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
baseCore = BaseCore() # baseCore = BaseCore()
log = baseCore.getLogger() # log = baseCore.getLogger()
cnx_ = baseCore.cnx # cnx_ = baseCore.cnx
cursor_ = baseCore.cursor # cursor_ = baseCore.cursor
#
def use_ocr(img): # def use_ocr(img):
ocr = ddddocr.DdddOcr() # ocr = ddddocr.DdddOcr()
with open(img, 'rb') as f: # with open(img, 'rb') as f:
image = f.read() # image = f.read()
res = ocr.classification(image) # res = ocr.classification(image)
print(res) # print(res)
return res # return res
#
if __name__=="__main__": # if __name__=="__main__":
requests.DEFAULT_RETRIES = 5 # requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}') # log.info(f'开始时间为:{time_start}')
#
requests.adapters.DEFAULT_RETRIES = 3 # requests.adapters.DEFAULT_RETRIES = 3
headers = { # headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36', # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
} # }
#
opt = webdriver.ChromeOptions() # opt = webdriver.ChromeOptions()
opt.add_argument( # opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36') # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
#
opt.add_argument("--ignore-certificate-errors") # opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors") # opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"]) # opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging']) # opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False) # opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe' # opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe' # chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) # browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "http://zxgk.court.gov.cn/shixin/" # url = "http://zxgk.court.gov.cn/shixin/"
browser.get(url) # browser.get(url)
# 可改动 # # 可改动
time.sleep(20) # time.sleep(20)
page_source = browser.page_source # page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser') # soup = BeautifulSoup(page_source, 'html.parser')
img_url = soup.select('img[id="captchaImg"]')[0]['src'] # img_url = soup.select('img[id="captchaImg"]')[0]['src']
#
browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司') # browser.find_element(By.ID, 'pName').send_keys('北京远翰国际教育咨询有限责任公司')
#
#
browser.find_element(By.ID, 'yzm').send_keys(yzm) # browser.find_element(By.ID, 'yzm').send_keys(yzm)
browser.find_element(By.ID, 'searchForm').click() # browser.find_element(By.ID, 'searchForm').click()
wait = WebDriverWait(browser, 30) # wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) # wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
# screen_img_path = "D:/screen/xxx.png" # screen_img_path = "D:/screen/xxx.png"
# out_img_path = "D:/out/xxx.png" # out_img_path = "D:/out/xxx.png"
...@@ -112,3 +112,27 @@ if __name__=="__main__": ...@@ -112,3 +112,27 @@ if __name__=="__main__":
# #
# code = use_ocr(out_img_path) # code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code) # 验证码输入框元素.send_keys(code)
import requests
headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf','wb') as f:
f.write(res.content)
\ No newline at end of file
# from baiduSpider import BaiduSpider
# from baiduSpider import BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# try:
# detailurl = url
# title = title
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
# except Exception as e:
# content = ''
# contentWithTag = ''
#
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
res_json = res.text
print(res_json)
\ No newline at end of file
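The body printed by the snippet above is JSONP — the JSON payload wrapped in the jQuery... callback named by the cb= parameter — so it normally has to be unwrapped before json.loads can parse it. A small helper sketch (the regex is an assumption about the wrapper shape, not code from the repo):

import json
import re

def jsonp_to_json(text: str) -> dict:
    # strip "callbackName( ... )" plus an optional trailing semicolon
    match = re.search(r"^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$", text, re.S)
    if match is None:
        raise ValueError("response does not look like JSONP")
    return json.loads(match.group(1))

# wrapper shaped like the eastmoney response
sample = 'jQuery35103326233792363984_1702455623969({"result": {"total": 10}});'
print(jsonp_to_json(sample))   # -> {'result': {'total': 10}}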