提交 5122cc37 作者: 薛凌堃

2023/8/12

上级 98ca1672
......@@ -364,14 +364,14 @@ class BaseCore:
return str
# 繁体字转简体字
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` from Traditional Chinese to Simplified Chinese.

    Returns whatever ``zhconv.convert(hant_str, 'zh-hans')`` produces.
    """
    # The method is defined inside BaseCore, so it takes ``self`` first; the
    # stale header without ``self`` that preceded this one has been dropped.
    return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(str_num):
def str_have_num(self,str_num):
panduan = False
for str_1 in str_num:
......@@ -463,6 +463,7 @@ class BaseCore:
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#检测语言
def detect_language(self, text):
......
......@@ -5,8 +5,10 @@ import time
from urllib.parse import quote
import requests
import urllib3
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
......@@ -19,7 +21,7 @@ headers = {
'Accept-Encoding': 'gzip, deflate, br,'
}
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(name):
def find_id_by_name(start,token,name):
urllib3.disable_warnings()
qcc_key = name
......@@ -35,14 +37,19 @@ def find_id_by_name(name):
time.sleep(5)
continue
time.sleep(2)
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
try:
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = ''
else:
KeyNo = ''
else:
KeyNo = ''
except:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
return KeyNo
print("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
\ No newline at end of file
......@@ -123,6 +123,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
report_type = td_list[4].text.strip()
# print(report_type)
if report_type == '年报':
if '摘要' in name_pdf:
continue
# 年份还从pdf名称里抽取
try:
year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
......
......@@ -5,26 +5,19 @@ import langid
from base.BaseCore import BaseCore
baseCore =BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor_ = cnx_.cursor()
updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# print(updateBeginSql)
cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
cnx_.commit()
#
# def detect_language(text):
# # 使用langid.py判断文本的语言
# lang, confidence = langid.classify(text)
# print(lang,confidence)
# return lang
# detect_language("123")
from textblob import TextBlob
def detect_language(text):
    """Return the language that TextBlob reports for ``text``.

    Wraps ``text`` in a ``TextBlob`` and returns the value of its
    ``detect_language()`` call unchanged.
    """
    # NOTE(review): newer textblob releases appear to have deprecated and then
    # removed ``detect_language()`` — confirm the installed version has it.
    return TextBlob(text).detect_language()
text = "Hello, how are you?"
language = detect_language(text)
print(language)
'''
补充智库动态没有公众号信息数据的公众号
从库中读取信息,根据域名找到属于公众号的链接,
设置time.sleep 等待到每天执行
记录一天能采多少公众号
'''
import requests, time, re, datetime, random, json, pymysql, redis
import requests, time, random, json, pymysql, redis
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
......@@ -216,7 +213,7 @@ if __name__=="__main__":
# browser2.get(url)
# browser3.get(url)
# 可改动
time.sleep(50)
time.sleep(30)
num_b = 0
browser_run = list_b[0]
log.info('======刷新浏览器=====')
......@@ -313,13 +310,13 @@ if __name__=="__main__":
count = 0
try:
ip = get_proxy()[random.randint(0, 3)]
json_search = s.get(url_search, headers=baseCore.getRandomUserAgent(), proxies=ip,
json_search = s.get(url_search, headers=headers, proxies=ip,
verify=False).json() # , proxies=ip, verify=False
time.sleep(2)
break
except:
log.info(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}===')
error_text = str(json_search)
# error_text = str(json_search)
json_search = ''
aa = time.sleep(600)
log.info(f'======等待时间{aa}=======')
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论