Commit 7fc3194e  Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	tmp/usVsRussia/pravo.py
@@ -19,12 +19,14 @@ from openpyxl import Workbook
 import langid
 #创建连接池
+import pymysql
 from pymysql import connections
 from DBUtils.PooledDB import PooledDB
-import pymysql
+# import sys
+# sys.path.append('D://zzsn_spider//base//fdfs_client')
 from fdfs_client.client import get_tracker_conf, Fdfs_client
-tracker_conf = get_tracker_conf('./client.conf')
+tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\base\\client.conf')
 client = Fdfs_client(tracker_conf)
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
......
@@ -12,7 +12,7 @@ r = basecore.r
 def cnn11():
     #11数据库
-    cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
+    cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
     cursor_ = cnx_.cursor()
     return cnx_,cursor_
 def close11(cnx_,cursor_):
@@ -22,7 +22,7 @@ def close11(cnx_,cursor_):
 # # 连接到Redis
 # r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
 #
-# cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+# cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
 #                       charset='utf8mb4')
 # cursor = cnx.cursor()
@@ -320,7 +320,7 @@ def FBS():
         if not r.exists(item):
             # r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)
             # r.rpush('CorPersonEnterpriseFbs:gnqy_socialCode', item)
-            r.rpush('NoticeEnterpriseFbs:gnqy_socialCode',item)
+            r.rpush('AnnualEnterprise:gnshqy_socialCode',item)
             # r.rpush('BaseInfoEnterpriseFbs:gnqy_social_code',item)
             # r.rpush('FinanceFromEast:eastfinance_socialCode',item)
     closeSql(cnx,cursor)
......
"""
打开SEC网址——【FILINGS】——【Company Filing】——输入证券代码——选10-K和20-F为年报
"""
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://www.sec.gov/edgar/browse/?CIK=1815846&owner=exclude'
#模拟浏览器
chromedriver = "D:/chrome/chromedriver.exe"
browser = webdriver.Chrome(chromedriver)
browser.get(url)
time.sleep(3)
page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser')
print(soup)
select_ann = soup.find_all('tr', class_='odd')
for tr in select_ann:
    want_type = tr.find('td').text
    if want_type == '20-F':
        print('yes')
        #获取原文链接
        td = tr.find('td').find('a', class_='document-link')['title_href']
        print(td)
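The script above drives Chrome only to read the EDGAR filings table described in its docstring (FILINGS → Company Filing → enter the CIK → keep 10-K / 20-F as annual reports). Below is a minimal sketch of the same lookup without a browser, assuming the public EDGAR submissions JSON endpoint; the helper name and User-Agent value are illustrative and not part of this commit.

# Illustrative sketch only (not part of this commit): query EDGAR's JSON API
# at https://data.sec.gov/submissions/CIK##########.json instead of rendering
# the HTML page with Selenium.
import requests

def list_annual_filings(cik: str):
    padded = str(int(cik)).zfill(10)  # EDGAR expects a zero-padded 10-digit CIK
    url = f'https://data.sec.gov/submissions/CIK{padded}.json'
    headers = {'User-Agent': 'example example@example.com'}  # SEC asks for a UA with contact info
    data = requests.get(url, headers=headers, timeout=20).json()
    recent = data['filings']['recent']
    for form, acc, doc in zip(recent['form'], recent['accessionNumber'], recent['primaryDocument']):
        if form in ('10-K', '20-F'):  # annual reports, as in the docstring above
            acc_nodash = acc.replace('-', '')
            yield form, f'https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_nodash}/{doc}'

# e.g. for the CIK 1815846 used above:
# for form, link in list_annual_filings('1815846'):
#     print(form, link)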
 import requests, re, time, pymysql
@@ -7,7 +7,7 @@ baseCore = BaseCore.BaseCore()
 requests.adapters.DEFAULT_RETRIES = 3
 log = baseCore.getLogger()
-cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
+cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
 cursor = cnx.cursor()
 tracker_conf = get_tracker_conf('./client.conf')
 client = Fdfs_client(tracker_conf)
@@ -131,7 +131,8 @@ def begin():
     while True:
         start_time = time.time()
         # 获取企业信息
-        social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
+        # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
+        social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
@@ -157,7 +158,7 @@ def begin():
         count += 1
         runType = 'AnnualReportCount'
         baseCore.updateRun(social_code, runType, count)
+        break
 if __name__ == '__main__':
     begin()
......
@@ -10,7 +10,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore.BaseCore()
 # conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
-cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
+cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
 cursor_ = cnx.cursor()
 cnx_ = baseCore.cnx
......
+import json
+import json
+from kafka import KafkaProducer
 from fdfs_client.client import get_tracker_conf, Fdfs_client
@@ -9,8 +12,9 @@ from base import BaseCore
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
 # conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
-cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
+cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
 cursor_ = cnx.cursor()
 tracker_conf = get_tracker_conf('./client.conf')
@@ -18,28 +22,6 @@ client = Fdfs_client(tracker_conf)
 taskType = '企业年报/证监会'
-# def get_proxy():
-#     cursor = cnx_ip.cursor()
-#     sql = "select proxy from clb_proxy"
-#     cursor.execute(sql)
-#     proxy_lists = cursor.fetchall()
-#     ip_list = []
-#     for proxy_ in proxy_lists:
-#         ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
-#     proxy_list = []
-#     for str_ip in ip_list:
-#         str_ip_list = str_ip.split('-')
-#         proxyMeta = "http://%(host)s:%(port)s" % {
-#             "host": str_ip_list[0],
-#             "port": str_ip_list[1],
-#         }
-#         proxy = {
-#             "HTTP": proxyMeta,
-#             "HTTPS": proxyMeta
-#         }
-#         proxy_list.append(proxy)
-#     return proxy_list
 def RequestUrl(url, payload, item_id, start_time):
     # ip = get_proxy()[random.randint(0, 3)]
@@ -118,7 +100,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
            pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
            name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
-           # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
+           pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
            # print(name)
            report_type = td_list[4].text.strip()
            # print(report_type)
@@ -129,11 +111,11 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
            try:
                year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
            except Exception as e:
-               pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
+               # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
                year = int(pub_time) - 1
                year = str(year)
-           page_size = 0
+           # page_size = 0
            sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s'''
            cursor_.execute(sel_sql, (item_id, year))
@@ -142,77 +124,65 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                print(f'com_name:{short_name}、{year}已存在')
                continue
            else:
-               # 类型为年报的话就解析该年报pdf,并入库
-               for i in range(0, 3):
-                   try:
-                       resp_content = requests.request("GET", pdf_url).content
-                       # 获取pdf页数
-                       with fitz.open(stream=resp_content, filetype='pdf') as doc:
-                           page_size = doc.page_count
-                       break
-                   except Exception as e:
-                       print(e)
-                       time.sleep(3)
-                       continue
-               if page_size < 1:
-                   # pdf解析失败
-                   print(f'==={short_name}、{year}===pdf解析失败')
-                   state = 0
-                   takeTime = baseCore.getTimeCost(start_time, time.time())
-                   baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
-                   continue
-               result = ''
-               for i in range(0, 3):
-                   try:
-                       result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
-                       break
-                   except Exception as e:
-                       print(e)
-                       time.sleep(3)
-                       continue
-               if result == '':
-                   e = '上传服务器失败'
-                   state = 0
-                   takeTime = baseCore.getTimeCost(start_time, time.time())
-                   baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                   continue
-               if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
-                   time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                   type_id = '1'
-                   item_id = dic_info['social_code']
-                   group_name = 'group1'
-                   path = bytes.decode(result['Remote file_id']).replace('group1', '')
-                   full_path = bytes.decode(result['Remote file_id'])
-                   category = 'pdf'
-                   file_size = result['Uploaded size']
-                   order_by = num
-                   status = 1
-                   create_by = 'XueLingKun'
-                   create_time = time_now
-                   page_size = page_size
-                   try:
-                       tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
-                                   category, file_size, order_by, status, create_by, create_time, page_size)
-                       state = 1
-                       takeTime = baseCore.getTimeCost(start_time, time.time())
-                       baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
-                   except:
-                       e = '数据库传输失败'
-                       state = 0
-                       takeTime = baseCore.getTimeCost(start_time, time.time())
-                       baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                   num = num + 1
-                   time.sleep(2)
-               else:
-                   e = '采集失败'
-                   state = 0
-                   takeTime = baseCore.getTimeCost(start_time, time.time())
-                   baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                   continue
+               retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
+               #插入数据库获取att_id
+               num = num + 1
+               att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
+               content = retData['content']
+               if retData['state']:
+                   pass
+               else:
+                   log.info(f'====pdf解析失败====')
+                   return False
+               time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+               dic_news = {
+                   'attachmentIds': att_id,
+                   'author': '',
+                   'content': content,
+                   'contentWithTag': '',
+                   'createDate': time_now,
+                   'deleteFlag': '0',
+                   'id': '',
+                   'keyWords': '',
+                   'lang': 'zh',
+                   'origin': '证监会',
+                   'publishDate': pub_time,
+                   'sid': '1684032033495392257',
+                   'sourceAddress': '',  # 原文链接
+                   'summary': '',
+                   'title': name_pdf,
+                   'type': 1,
+                   'socialCreditCode': social_code,
+                   'year': year
+               }
+               # print(dic_news)
+               # 将相应字段通过kafka传输保存
+               try:
+                   producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                   kafka_result = producer.send("researchReportTopic",
+                                                json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+                   print(kafka_result.get(timeout=10))
+                   dic_result = {
+                       'success': 'ture',
+                       'message': '操作成功',
+                       'code': '200',
+                   }
+                   print(dic_result)
+                   return True
+               except Exception as e:
+                   dic_result = {
+                       'success': 'false',
+                       'message': '操作失败',
+                       'code': '204',
+                       'e': e
+                   }
+                   state = 0
+                   takeTime = baseCore.getTimeCost(start_time, time.time())
+                   baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
+                   print(dic_result)
+                   return False
        else:
            continue
......
# -*- coding: utf-8 -*-
"""
从数据库中读取年报缺失年份,采集对应网站上的年报,存在两种情况,标题中有年份,标题中无年份。
如果标题中有年份的话,按照原方式命名,有年份的应该都已经采过,跳过不插入更新
如果标题中无年份的话,则解析正文内容,正则表达式匹配年份,
采集一条,state 加1 如果报错的话就将state改为100,单独处理。
"""
import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
import requests, re, time, pymysql, fitz
from bs4 import BeautifulSoup as bs
from selenium import webdriver
chromedriver = "D:/chrome/chromedriver.exe"
browser = webdriver.Chrome(chromedriver)
from fdfs_client.client import get_tracker_conf, Fdfs_client
log = baseCore.getLogger()
requests.adapters.DEFAULT_RETRIES = 3
# conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji', charset='utf8mb4')
# # cnx_ip = pymysql.connect(host='114.115.159.144',user='caiji', password='zzsn9988', db='clb_project', charset='utf8mb4')
# cursor_ = cnx_.cursor()
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
def clean_text(text):
"""
清理多余空行
:param text:
:return:
"""
soup = bs(text, 'html.parser')
# print(soup.get_text())
text = soup.get_text()
# str1 = re.sub('[\n]+', '\n', 'dfadf d\n \n\n \nfa ds ')
text_ = re.sub('\n+', '\n', text.replace('\t', '').replace('\r', ''))
return text_
def spider_annual_report(dict_info,num):
social_code = dict_info['social_code']
com_name = dict_info['com_name']
code = dict_info['code']
url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'
browser.get(url_1)
time.sleep(3)
page_source = browser.page_source
soup = bs(page_source, 'html.parser')
# res_1 = requests.get(url_1, proxies=ip)
# soup = bs(res_1.content, 'html.parser')
try:
list_all = soup.find('div', {'class': 'datelist'}).find_all('a')
except:
log.info(f'{social_code}.........年度报告列表为空')
exception = '年度报告列表为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
return
for i in list_all:
# ip = get_proxy()[random.randint(0, 3)]
pdf_name_a = i.text
year_url = 'https://vip.stock.finance.sina.com.cn' + i.get('href')
year_name = i.text
browser.get(year_url)
time.sleep(5)
page_source_2 = browser.page_source
# res_2 = requests.get(year_url, proxies=ip)
soup_2 = bs(page_source_2, 'html.parser')
try:
pdf_url = soup_2.find('th', {'style': 'text-align:center'}).find('a').get('href')
except:
#todo:无连接但是有正文内容
log.error(f'{social_code}....{year_url}....无下载链接')
exception = '无下载链接'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
#公告日期
pub_time = soup_2.find('td',{'class':'head'}).text.split('公告日期')[1]
try:
# 标题中有年份,
year = re.findall('\d{4}', year_name)[0]
if com_name != 'null':
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
except:
# 标题中无年份
content = soup_2.find('div', {'id': 'content'}).text
# 清除多余空行
content_c = clean_text(content)
for i in range(0, 4):
# 取第i行的数据
try:
line = content_c.split('\n')[i]
try:
# 正则表达式匹配年份
year_ = re.findall('\d{4}\s*年年度报告', line)[0]
year = re.findall('\d{4}', year_)[0]
if com_name != '':
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
break
except:
try:
result = soup_2.find('td', class_='head').text
year = str(int(re.findall('\d{4}', result)[0]) - 1)
if com_name != '':
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
except:
continue
except:
# result = soup_2.find('td', class_='head').text
year = str(int(re.findall('\d{4}', pub_time)[0]) - 1)
if com_name != '':
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
# name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
# name_pdf = pdf_name_a + '.pdf'
with cnx.cursor() as cursor:
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone()
if selects:
print(f'com_name:{com_name}、{year}已存在')
continue
else:
page_size = 0
#上传文件至文件服务器
retData = baseCore.upLoadToServe(pdf_url,1,social_code)
num = num + 1
try:
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
content = retData['content']
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
except:
exception = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
#发送数据到kafka
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': att_id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '雪球网',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': year_url, # 原文链接
'summary': '',
'title': name_pdf,
'type': 1,
'socialCreditCode': social_code,
'year': year
}
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
print(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result)
return False
# num = num + 1
time.sleep(2)
# browser.quit()
#state1
if __name__ == '__main__':
num = 0
taskType = '企业年报/雪球网'
while True:
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '911100007109288314'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
code = dic_info[3]
com_name = dic_info[4]
if code is None:
exeception = '股票代码为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
continue
while True:
if len(code) < 6:
code = "0"+code
else:
break
# years = tuple(call_year)
dict_info = {
'social_code':social_code,
'com_name':com_name,
'code':code,
}
# list_info.append(dict_info)
spider_annual_report(dict_info,num)
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
# cursor.close()
cnx_.close()
# 释放资源
baseCore.close()
@@ -222,10 +222,10 @@ class BaseCore:
    __USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
    def __init__(self):
-       self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
+       self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
                                           charset='utf8mb4')
        self.__cursor_proxy = self.__cnx_proxy.cursor()
-       self.cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+       self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                                   charset='utf8mb4')
        self.cursor = self.cnx.cursor()
......
 # -*- coding: utf-8 -*-
@@ -143,7 +143,7 @@ class YahooCaiwu(object):
        return driver
    def conn11(self):
-       conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
+       conn = pymysql.Connect(host='114.116.44.11', port=3306, user='caiji', passwd='f7s0&7qqtK', db='clb_project',
                               charset='utf8')
        cursor = conn.cursor()
        return conn,cursor
......
 # -*- coding: utf-8 -*-
@@ -46,7 +46,7 @@ class Shizhi(object):
        return driver
    def conn11(self):
-       conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
+       conn = pymysql.Connect(host='114.116.44.11', port=3306, user='caiji', passwd='f7s0&7qqtK', db='clb_project',
                               charset='utf8')
        cursor = conn.cursor()
        return conn,cursor
......
""" """
...@@ -8,7 +8,7 @@ import pandas as pd ...@@ -8,7 +8,7 @@ import pandas as pd
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor() cursor = cnx.cursor()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
cursor_ = baseCore.cursor cursor_ = baseCore.cursor
...@@ -439,14 +439,14 @@ def getReportTime(): ...@@ -439,14 +439,14 @@ def getReportTime():
# 2023-04-01 # 2023-04-01
#todo:正式任务 #todo:正式任务
# 获取当前日期和时间 # 获取当前日期和时间
# current_date = datetime.now() current_date = datetime.now()
# 计算昨天的日期 # 计算昨天的日期
# yesterday = current_date - timedelta(days=1) yesterday = current_date - timedelta(days=1)
# 格式化昨天的日期 # 格式化昨天的日期
# report_date = yesterday.strftime('%Y-%m-%d') report_date = yesterday.strftime('%Y-%m-%d')
# list_date.append(report_date) list_date.append(report_date)
# year = int(current_date.strftime('%Y')) year = int(current_date.strftime('%Y'))
list_date = ['2023-03-31'] # list_date = ['2023-03-31']
list_month = ['-12-31', '-09-30', '-06-30', '-03-31'] list_month = ['-12-31', '-09-30', '-06-30', '-03-31']
for year in range(2022, 2018, -1): for year in range(2022, 2018, -1):
......
@@ -20,7 +20,7 @@ class Gpdm(object):
        'version':'TYC-Web',
        'Content-Type':'application/json;charset=UTF-8'
    }
-   cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',charset='utf8mb4')
+   cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',charset='utf8mb4')
    cursor= cnx.cursor()
    taskType = '股票代码/东方财富网'
......
import requests, pymysql, re, time, json, sys
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures.thread import ThreadPoolExecutor
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
def InsterInto(short_name, social_code, pdf_url):
inster = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
cursor.execute(sel_sql, (social_code, pdf_url))
selects = cursor.fetchone()
if selects:
print(f'com_name:{short_name}、{pdf_url}已存在')
return inster
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
list_info = [
social_code,
pdf_url,
'东方财富网',
'1',
]
#144数据库
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
insert = True
return insert
except:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert
def gonggao_info(dic_info):
list_all_info = []
code = dic_info[3]
com_name = dic_info[4]
social__code = dic_info[2]
if 'HK' in code:
# browser.quit()
return
code1 = str(code)
while True:
if len(code1) < 6:
code1 = '0' + code1
else:
break
if code1[0] == '0' or code1[0] == '3' or code[0] == '2':
com_code = 'SZ' + code1
elif code1[0] == '6' or code1[0] == '9':
com_code = 'SH' + code1
elif code1[0] == '8' or code1[0] == '4':
com_code = 'BJ' + code1
break_id = 0
for page1 in range(1, 2):
if break_id == 1:
break
url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index={page1}&ann_type=A&client_source=web&stock_list={code1}&f_node=0&s_node=0'
for n1 in range(0, 3):
try:
res = requests.get(url, verify=False)
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
res_json = res.json()
list_all = res_json['data']['list']
if list_all:
for one_info in list_all:
title = one_info['title']
info_date = one_info['notice_date']
if page1 > 1 and '2022' in info_date:
break_id = 1
break
if '2021' in info_date: # 只采集22年以后的数据
break_id = 1
break
try:
info_type = one_info['columns'][0]['column_name']
except:
info_type = ''
art_code = one_info['art_code']
info_url = 'https://data.eastmoney.com/notices/detail/' + com_code + '/' + art_code + '.html'
t = int(time.time() * 1000)
json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
for n1 in range(0, 3):
try:
json_2 = requests.get(json_url, verify=False).json()
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
try:
pdf_url = json_2['data']['attach_url']
except:
pdf_url = ''
#拿到pdfurl去数据库中查找,如果有该条信息 则跳过,否则继续采集
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and type='1' '''
cursor.execute(sel_sql, info_url)
selects = cursor.fetchall()
if selects:
return
else:
pass
try:
info_content = json_2['data']['notice_content']
except:
info_content = ''
list_info = [
social_code,
title,
info_content[:2000],
info_date,
info_url,
pdf_url,
'东方财富网',
'1',
'zh'
]
# list_all_info.append(tuple(list_info))
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s '''
cursor.execute(sel_sql, info_url)
selects = cursor.fetchall()
if selects:
break
else:
#todo:取消入库操作
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,publish_date,source_address,pdf_address,origin,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
else:
break
print(f'{code}:传输完成')
# list_all_info_1.append(list_all_info)
list_c.append(code)
if __name__ =='__main__':
#从redis中读取social_code'
list_c = []
list_all_info_1 = []
num = 0
taskType = '企业公告/东方财富网'
while True:
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('NoticeEnterpriseEasteFinance:gnshqy_socialCode')
# social_code = '911100007109288314'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
code = dic_info[3]
com_name = dic_info[4]
gonggao_info(dic_info)
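# Illustrative note (not part of the original file): gonggao_info() above derives the
# Eastmoney secucode prefix from the first digit of the zero-padded 6-digit stock code:
# 0/2/3 -> SZ (Shenzhen), 6/9 -> SH (Shanghai), 4/8 -> BJ (Beijing stock exchange).
# A compact, hypothetical equivalent of that branch, for reference only:
# def to_secucode(code: str) -> str:
#     code = code.zfill(6)
#     prefix = 'SZ' if code[0] in '023' else 'SH' if code[0] in '69' else 'BJ'
#     return prefix + code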
""" """
...@@ -168,6 +168,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -168,6 +168,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
#上传至文件服务器 #上传至文件服务器
retData = baseCore.upLoadToServe(pdf_url,8,social_code) retData = baseCore.upLoadToServe(pdf_url,8,social_code)
#附件插入att数据库 #附件插入att数据库
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
num = num + 1 num = num + 1
att_id = baseCore.tableUpdate(retData,com_name,year,pdf_name,num) att_id = baseCore.tableUpdate(retData,com_name,year,pdf_name,num)
content = retData['content'] content = retData['content']
...@@ -176,27 +181,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -176,27 +181,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
else: else:
log.info(f'====pdf解析失败====') log.info(f'====pdf解析失败====')
return False return False
# 先获取PDF链接下载pdf,在解析内容
# try:
# res = requests.get(pdf_url)
# content = ''
# # 读取文件内容,解析内容
# with fitz.open(stream=res.content, filetype='pdf') as doc:
# for page in doc.pages():
# content += page.get_text()
# except:
# # print('解析失败')
# dic_result = {
# 'success': 'false',
# 'message': 'PDF解析失败',
# 'code': '204',
# }
# log.info(dic_result)
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
# return False
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -373,8 +358,8 @@ if __name__ == '__main__': ...@@ -373,8 +358,8 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode') social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
social_code = '9110000071092841XX' # social_code = '9110000071092841XX'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
......
This source diff could not be displayed because it is too large.
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'ozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
#11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
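# Illustrative usage (not part of the original file), matching the format notes above and
# assuming a UTC+8 local clock; type 3 is int(time.time() * 1000), i.e. epoch milliseconds:
#   getNowTime(1) -> '2023-07-24 14:18:46'
#   getNowTime(2) -> '230724141846'
#   getNowTime(3) -> 1690179526555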
# 获取流水号
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
#字符串截取
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
pass
else:
end=str.rfind(endStr)
if end==-1:
pass
else:
str = str[0:end+1]
return str
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
#检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
if result == '':
return 'cn'
if result[0] == '':
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
combined_data = existing_data.append(new_data, ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
    #解析word文件页数(python-docx取不到真实页数,这里用节(section)数近似)
    def doc_page(self,file_path):
        doc = Document(file_path)
        return len(doc.sections)
def pdf_page(self,resp_content):
# 解析pdf文件
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
return page_size
# 替换为绝对路径之后,解析出来a.href
def uploadToserver(self,file_href,item_id):
category = os.path.splitext(file_href)[1]
# 上传至文件服务器
headers = {}
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = self.getRandomUserAgent()
resp_content = ''
for i in range(0, 3):
try:
resp_content = requests.get(file_href, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
if resp_content:
pass
else:
return retData
# page_size = 0
# if category == '.doc' or category == '.docx':
# # page_size = self.doc_page(file_href)
# return retData
# if category == '.pdf' or category == '.PDF':
# page_size = self.pdf_page(resp_content)
        result = None
        for i in range(0, 3):
            try:
                result = client.upload_by_buffer(resp_content)
                self.getLogger().info('-------文件上传成功------')
                break
            except:
                time.sleep(3)
                continue
        if result is None:
            # 三次上传均失败,直接返回,避免后面引用未定义的result
            return retData
# if page_size>0:
# pass
# else:
# self.getLogger().info(f'======解析失败=====')
# return retData
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
# retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,file_name,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,file_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,file_name,type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id,full_path
else:
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id)
id = selects[0]
return id,full_path
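# ------------------------------------------------------------------
# 用法示例(仅为草图):uploadToserver与tableUpdate通常配合使用——
# 先把附件抓取并上传到FastDFS,成功后再把附件元数据写入clb_sys_attachment。
# 下面的URL、信用代码与企业名均为虚构的占位值。
if __name__ == '__main__':
    baseCore = BaseCore()
    file_href = 'https://example.com/reports/annual_2022.pdf'  # 虚构的附件地址
    item_id = '91110000EXAMPLE000X'                            # 虚构的信用代码
    retData = baseCore.uploadToserver(file_href, item_id)
    if retData['state']:
        att_id, full_path = baseCore.tableUpdate(retData, '示例企业', 'annual_2022.pdf', 1)
        print(att_id, full_path)
    else:
        print('附件上传失败,跳过入库')
    baseCore.close()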
# __init__.py
__version__ = '2.2.0'
VERSION = tuple(map(int, __version__.split('.')))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: connection.py
import socket
import os
import sys
import time
import random
from itertools import chain
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# start class Connection
class Connection(object):
'''Manage TCP comunication to and from Fastdfs Server.'''
def __init__(self, **conn_kwargs):
self.pid = os.getpid()
self.host_tuple = conn_kwargs['host_tuple']
self.remote_port = conn_kwargs['port']
self.remote_addr = None
self.timeout = conn_kwargs['timeout']
self._sock = None
def __del__(self):
try:
self.disconnect()
except:
pass
def connect(self):
'''Connect to fdfs server.'''
if self._sock:
return
try:
sock = self._connect()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = sock
# print '[+] Create a connection success.'
# print '\tLocal address is %s:%s.' % self._sock.getsockname()
# print '\tRemote address is %s:%s' % (self.remote_addr, self.remote_port)
def _connect(self):
'''Create TCP socket. The host is random one of host_tuple.'''
self.remote_addr = random.choice(self.host_tuple)
# print '[+] Connecting... remote: %s:%s' % (self.remote_addr, self.remote_port)
# sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.settimeout(self.timeout)
sock = socket.create_connection((self.remote_addr, self.remote_port), self.timeout)
return sock
def disconnect(self):
'''Disconnect from fdfs server.'''
if self._sock is None:
return
try:
self._sock.close()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = None
def get_sock(self):
return self._sock
def _errormessage(self, exception):
        # args for socket.error can either be (errno, "message")
        # or just "message"
if len(exception.args) == 1:
return "[-] Error: connect to %s:%s. %s." % (self.remote_addr, self.remote_port, exception.args[0])
else:
return "[-] Error: %s connect to %s:%s. %s." % \
(exception.args[0], self.remote_addr, self.remote_port, exception.args[1])
# end class Connection
# start ConnectionPool
class ConnectionPool(object):
'''Generic Connection Pool'''
def __init__(self, name='', conn_class=Connection,
max_conn=None, **conn_kwargs):
self.pool_name = name
self.pid = os.getpid()
self.conn_class = conn_class
self.max_conn = max_conn or 2 ** 31
self.conn_kwargs = conn_kwargs
self._conns_created = 0
self._conns_available = []
self._conns_inuse = set()
# print '[+] Create a connection pool success, name: %s.' % self.pool_name
    def _check_pid(self):
        if self.pid != os.getpid():
            self.destroy()
            self.__init__(self.pool_name, self.conn_class, self.max_conn, **self.conn_kwargs)
def make_conn(self):
'''Create a new connection.'''
if self._conns_created >= self.max_conn:
raise ConnectionError('[-] Error: Too many connections.')
num_try = 10
while True:
try:
if num_try <= 0:
sys.exit()
conn_instance = self.conn_class(**self.conn_kwargs)
conn_instance.connect()
self._conns_created += 1
break
except ConnectionError as e:
print(e)
num_try -= 1
conn_instance = None
return conn_instance
def get_connection(self):
'''Get a connection from pool.'''
self._check_pid()
try:
conn = self._conns_available.pop()
# print '[+] Get a connection from pool %s.' % self.pool_name
# print '\tLocal address is %s:%s.' % conn._sock.getsockname()
# print '\tRemote address is %s:%s' % (conn.remote_addr, conn.remote_port)
except IndexError:
conn = self.make_conn()
self._conns_inuse.add(conn)
return conn
def remove(self, conn):
'''Remove connection from pool.'''
if conn in self._conns_inuse:
self._conns_inuse.remove(conn)
self._conns_created -= 1
if conn in self._conns_available:
self._conns_available.remove(conn)
self._conns_created -= 1
def destroy(self):
'''Disconnect all connections in the pool.'''
all_conns = chain(self._conns_inuse, self._conns_available)
for conn in all_conns:
conn.disconnect()
# print '[-] Destroy connection pool %s.' % self.pool_name
def release(self, conn):
'''Release the connection back to the pool.'''
self._check_pid()
if conn.pid == self.pid:
self._conns_inuse.remove(conn)
self._conns_available.append(conn)
# print '[-] Release connection back to pool %s.' % self.pool_name
# end ConnectionPool class
def tcp_recv_response(conn, bytes_size, buffer_size=4096):
'''Receive response from server.
It is not include tracker header.
arguments:
@conn: connection
@bytes_size: int, will be received byte_stream size
@buffer_size: int, receive buffer size
@Return: tuple,(response, received_size)
'''
recv_buff = []
total_size = 0
try:
        while bytes_size > 0:
            resp = conn._sock.recv(buffer_size)
            if not resp:
                # 对端提前关闭连接时recv返回空字节串,跳出循环避免死循环
                break
            recv_buff.append(resp)
            total_size += len(resp)
            bytes_size -= len(resp)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: (%s)' % e.args)
return (b''.join(recv_buff), total_size)
def tcp_send_data(conn, bytes_stream):
'''Send buffer to server.
It is not include tracker header.
arguments:
@conn: connection
@bytes_stream: trasmit buffer
@Return bool
'''
try:
conn._sock.sendall(bytes_stream)
except (socket.error, socket.timeout) as e:
        raise ConnectionError('[-] Error: while writing to socket: (%s)' % e.args)
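# ------------------------------------------------------------------
# 用法示例(仅为草图):ConnectionPool负责复用到tracker/storage的TCP连接,
# 取出连接后配合tcp_send_data/tcp_recv_response收发协议报文,用完再release。
# 下面的地址127.0.0.1:22122是假设的tracker地址,并非本项目的真实配置。
if __name__ == '__main__':
    pool = ConnectionPool(name='tracker_pool',
                          host_tuple=('127.0.0.1',),   # 假设的tracker地址
                          port=22122,
                          timeout=30,
                          max_conn=10)
    conn = pool.get_connection()
    try:
        tcp_send_data(conn, b'')              # 这里只演示调用方式,实际应发送tracker协议报文
        # resp, size = tcp_recv_response(conn, 0)  # 按协议头中的长度字段接收响应体
    finally:
        pool.release(conn)
        pool.destroy()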
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_test.py
import os
import sys
import time
try:
from fdfs_client.client import *
from fdfs_client.exceptions import *
except ImportError:
import_path = os.path.abspath('../')
sys.path.append(import_path)
from fdfs_client.client import *
from fdfs_client.exceptions import *
def usage():
s = 'Usage: python fdfs_test.py {options} [{local_filename} [{remote_file_id}]]\n'
s += 'options: upfile, upbuffer, downfile, downbuffer, delete, listgroup, listserv\n'
s += ' upslavefile, upslavebuffer, upappendfile, upappendbuffer\n'
s += '\tupfile {local_filename}\n'
s += '\tupbuffer {local_filename}\n'
s += '\tdownfile {local_filename} {remote_file_id}\n'
s += '\tdownbuffer {remote_file_id}\n'
s += '\tdelete {remote_file_id}\n'
s += '\tlistgroup {group_name}\n'
s += '\tlistall \n'
s += '\tlistsrv {group_name} [storage_ip]\n'
s += '\tsetmeta {remote_file_id}\n'
s += '\tgetmeta {remote_file_id}\n'
s += '\tupslavefile {local_filename} {remote_fileid} {prefix_name}\n'
s += '\tupappendfile {local_filename}\n'
s += '\ttruncate {truncate_filesize} {remote_fileid}\n'
s += '\tmodifyfile {local_filename} {remote_fileid} {file_offset}\n'
s += '\tmodifybuffer {local_filename} {remote_fileid} {file_offset}\n'
s += 'e.g.: python fdfs_test.py upfile test'
print(s)
sys.exit(0)
if len(sys.argv) < 2:
usage()
client = Fdfs_client(get_tracker_conf('client.conf'))
def upfile_func():
# Upload by filename
# usage: python fdfs_test.py upfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
file_size = os.stat(local_filename).st_size
# meta_buffer can be null.
meta_dict = {
'ext_name': 'py',
'file_size': str(file_size) + 'B'
}
t1 = time.time()
ret_dict = client.upload_by_filename(local_filename, meta_dict)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upfileex_func():
# Upload by file
# usage: python fdfs_test.py upfileex {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
t1 = time.time()
ret_dict = client.upload_by_file(local_filename)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavefile_func():
# upload slave file
# usage: python fdfs_test.py upslavefile {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
ret_dict = client.upload_slave_by_file(local_filename, remote_fileid, \
prefix_name)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavebuffer_func():
# upload slave by buffer
# usage: python fdfs_test.py upslavebuffer {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
        with open(local_filename, 'rb') as f:
            filebuffer = f.read()
        ret_dict = client.upload_slave_by_buffer(filebuffer, \
                                                 remote_fileid, prefix_name)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def del_func():
# delete file
# usage: python fdfs_test.py delete {remote_fileid}
if len(sys.argv) < 3:
usage()
return None
try:
remote_file_id = sys.argv[2]
ret_tuple = client.delete_file(remote_file_id)
print('[+] %s' % ret_tuple[0])
print('[+] remote_fileid: %s' % ret_tuple[1])
print('[+] Storage IP: %s' % ret_tuple[2])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downfile_func():
# Download to file
# usage: python fdfs_test.py downfile {local_filename} {remote_fileid}
    if len(sys.argv) < 4:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
ret_dict = client.download_to_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_group_func():
# List one group info
# usage: python fdfs_test.py listgroup {group_name}
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
ret = client.list_one_group(group_name)
print(ret)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def listall_func():
# List all group info
# usage: python fdfs_test.py listall
if len(sys.argv) < 2:
usage()
return None
try:
ret_dict = client.list_all_groups()
print('=' * 80)
print('Groups count:', ret_dict['Groups count'])
for li in ret_dict['Groups']:
print('-' * 80)
print(li)
print('-' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_server_func():
# List all servers info of group
# usage: python fdfs_test.py listsrv {group_name} [storage_ip]
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
if len(sys.argv) > 3:
storage_ip = sys.argv[3]
else:
storage_ip = None
ret_dict = client.list_servers(group_name, storage_ip)
print('=' * 80)
print('Group name: %s' % ret_dict['Group name'])
print('=' * 80)
i = 1
for serv in ret_dict['Servers']:
print('Storage server %d:' % i)
print('=' * 80)
print(serv)
i += 1
print('=' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upbuffer_func():
# Upload by buffer
# usage: python fdfs_test.py upbuffer {local_filename} [remote_file_ext_name]
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
if len(sys.argv) > 3:
ext_name = sys.argv[3]
else:
ext_name = None
# meta_buffer can be null.
meta_buffer = {
'ext_name': 'gif',
'width': '150px',
'height': '80px'
}
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_by_buffer(file_buffer, ext_name, meta_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downbuffer_func():
# Download to buffer
# usage: python fdfs_test.py downbuffer {remote_file_id}
# e.g.: 'group1/M00/00/00/wKjzhU_rLNmjo2-1AAAamGDONEA5818.py'
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.download_to_buffer(remote_fileid)
print('Downloaded content:')
print(ret_dict['Content'])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def get_meta_data_func():
# Get meta data of remote file
# usage python fdfs_test.py getmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.get_meta_data(remote_fileid)
print(ret_dict)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def set_meta_data_func():
# Set meta data of remote file
# usage python fdfs_test.py setmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
    meta_dict = {
        'ext_name': 'jpg',
        'width': '160px',
        'height': '80px',
    }
try:
ret_dict = client.set_meta_data(remote_fileid, meta_dict)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendfile_func():
# Upload an appender file by filename
# usage: python fdfs_test.py upappendfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
ret_dict = client.upload_appender_by_file(local_filename)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendbuffer_func():
# Upload an appender file by buffer
# usage: python fdfs_test.py upappendbuffer {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_appender_by_buffer(file_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendfile_func():
# Append a remote file
# usage: python fdfs_test.py appendfile {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
ret_dict = client.append_by_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendbuffer_func():
# Append a remote file by buffer
# usage: python fdfs_test.py appendbuffer {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.append_by_buffer(filebuffer, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def truncate_func():
# Truncate file
# usage: python fdfs_test.py truncate {truncate_filesize} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
truncate_filesize = int(sys.argv[2])
remote_fileid = sys.argv[3]
try:
ret_dict = client.truncate_file(truncate_filesize, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifyfile_func():
# Modify file by filename
# usage: python fdfs_test.py modifyfile {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
ret_dict = client.modify_by_filename(local_filename, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifybuffer_func():
# Modify file by buffer
# usage: python fdfs_test.py modifybuffer {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.modify_by_buffer(filebuffer, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
result = {
'upfile': lambda: upfile_func(),
'upfileex': lambda: upfileex_func(),
'upbuffer': lambda: upbuffer_func(),
'delete': lambda: del_func(),
'downfile': lambda: downfile_func(),
'downbuffer': lambda: downbuffer_func(),
'listgroup': lambda: list_group_func(),
'listall': lambda: listall_func(),
'listsrv': lambda: list_server_func(),
'getmeta': lambda: get_meta_data_func(),
'setmeta': lambda: set_meta_data_func(),
'upslavefile': lambda: upslavefile_func(),
'upappendfile': lambda: upappendfile_func(),
'upappendbuffer': lambda: upappendbuffer_func(),
'appendfile': lambda: appendfile_func(),
'appendbuffer': lambda: appendbuffer_func(),
'truncate': lambda: truncate_func(),
'modifyfile': lambda: modifyfile_func(),
'modifybuffer': lambda: modifybuffer_func(),
'-h': lambda: usage(),
}.get(sys.argv[1].lower(), usage)()
...@@ -22,7 +22,7 @@ headers = { ...@@ -22,7 +22,7 @@ headers = {
'version':'TYC-Web', 'version':'TYC-Web',
'Content-Type':'application/json;charset=UTF-8' 'Content-Type':'application/json;charset=UTF-8'
} }
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor() cursor= cnx.cursor()
#根据信用代码获取天眼查id 企业名字等信息 #根据信用代码获取天眼查id 企业名字等信息
......
...@@ -12,7 +12,7 @@ jieba.cut("必须加载jieba") ...@@ -12,7 +12,7 @@ jieba.cut("必须加载jieba")
smart =smart_extractor.SmartExtractor('cn') smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor() cursor= cnx.cursor()
pageSize = 10 pageSize = 10
headers = { headers = {
......
import json
import time
import requests
from pymysql.converters import escape_string
from selenium import webdriver
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def flushAndGetToken(browser):
log.info('======刷新浏览器=====')
browser.refresh()
cookie_list = browser.get_cookies()
cur_url = browser.current_url
token = cur_url.split('token=')[1]
log.info(f'===========当前token为:{token}============')
cookies = {}
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
browser.get(cur_url)
info = browser.page_source
# res_2 = requests.get(year_url, proxies=ip)
soup = BeautifulSoup(info, 'html.parser')
user_name = soup.find('div', class_='weui-desktop_name').text
return token,cookies,user_name
if __name__=="__main__":
requests.DEFAULT_RETRIES = 5
time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.info(f'开始时间为:{time_start}')
requests.adapters.DEFAULT_RETRIES = 3
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location =r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
# chromedriver = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
chromedriver = r'D:/chrome/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(70)
s = requests.session()
#获取到token和cookies
token, cookies,user_name = flushAndGetToken(browser)
print(token,cookies)
cookies = json.dumps(cookies)
# loadinfo = [token,cookies]
#保存到数据库中
insert = f"insert into weixin_tokenCookies (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
cursor_.execute(insert)
cnx_.commit()
baseCore.close()
# s.cookies.update(cookies)
# s.keep_alive = False
...@@ -20,13 +20,13 @@ baseCore = BaseCore() ...@@ -20,13 +20,13 @@ baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
cursor_ = baseCore.cursor cursor_ = baseCore.cursor
cnx = pymysql.connect(host="114.116.44.11", user="root", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4") cnx = pymysql.connect(host="114.116.44.11", user="caiji", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4")
cursor = cnx.cursor() cursor = cnx.cursor()
r = baseCore.r r = baseCore.r
urllib3.disable_warnings() urllib3.disable_warnings()
def check_url(sid, article_url): def check_url(sid, article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn') # r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.sismember(f'wx_url_{sid}',article_url) res = r.sismember(f'wx_url_{sid}',article_url)
if res == True: if res == True:
return True return True
...@@ -34,7 +34,7 @@ def check_url(sid, article_url): ...@@ -34,7 +34,7 @@ def check_url(sid, article_url):
return False return False
def add_url(sid, article_url): def add_url(sid, article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn') # r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.sadd(f'wx_url_{sid}', article_url, 3) # 注意是 保存set的方式 res = r.sadd(f'wx_url_{sid}', article_url, 3) # 注意是 保存set的方式
if res == 0: # 若返回0,说明插入不成功,表示有重复 if res == 0: # 若返回0,说明插入不成功,表示有重复
return True return True
...@@ -88,10 +88,10 @@ def get_info(sid,json_search,origin,url_,info_source_code,page): ...@@ -88,10 +88,10 @@ def get_info(sid,json_search,origin,url_,info_source_code,page):
url_news = one_news['link'] url_news = one_news['link']
url_ft = check_url(sid, url_news) # url_ft = check_url(sid, url_news)
if url_ft: # if url_ft:
log.info(f'-----{origin}--第{page}页--已采过该篇文章--文章链接--{url_news}-----') # log.info(f'-----{origin}--第{page}页--已采过该篇文章--文章链接--{url_news}-----')
return list_all_info,num_caiji # return list_all_info,num_caiji
try: try:
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
res_news = requests.get(url_news, timeout=20,proxies=ip) res_news = requests.get(url_news, timeout=20,proxies=ip)
...@@ -176,43 +176,46 @@ def get_info(sid,json_search,origin,url_,info_source_code,page): ...@@ -176,43 +176,46 @@ def get_info(sid,json_search,origin,url_,info_source_code,page):
'source': '11', 'source': '11',
'createDate': time_now 'createDate': time_now
} }
for nnn in range(0, 3): # for nnn in range(0, 3):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) # producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try: # try:
kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8')) # kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
kafka_time_out = kafka_result.get(timeout=10) # kafka_time_out = kafka_result.get(timeout=10)
add_url(sid, url_news) # add_url(sid, url_news)
break # break
except: # except:
time.sleep(5) # time.sleep(5)
continue # continue
finally: # finally:
producer.close() # producer.close()
num_caiji = num_caiji + 1 num_caiji = num_caiji + 1
list_all_info.append(dic_info) list_all_info.append(dic_info)
time.sleep(5) time.sleep(5)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info2 = { # dic_info2 = {
'infoSourceId': sid, # 'infoSourceId': sid,
'code': info_source_code, # 'code': info_source_code,
'num': num_caiji, # 'num': num_caiji,
'collectTime': kaishi_time, # 'collectTime': kaishi_time,
'dispatcherTime': time_now, # 'dispatcherTime': time_now,
'dispatcherStatus': '1', # 'dispatcherStatus': '1',
'source': '1', # 'source': '1',
} # }
for nnn2 in range(0, 3): # for nnn2 in range(0, 3):
try: # producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) # try:
kafka_result2 = producer2.send("collectionAndDispatcherInfo", # # producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
json.dumps(dic_info2, ensure_ascii=False).encode('utf8')) # kafka_result2 = producer2.send("collectionAndDispatcherInfo",
break # json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
except: # break
time.sleep(5) # except:
continue # time.sleep(5)
# continue
# finally:
# producer2.close()
return list_all_info,num_caiji return list_all_info,num_caiji
def RequestUrl(dic_url,token,key): def RequestUrl(dic_url,token,key,i):
start_ = time.time() start_ = time.time()
url_ = dic_url['url_'] url_ = dic_url['url_']
origin = dic_url['name'] origin = dic_url['name']
...@@ -220,14 +223,13 @@ def RequestUrl(dic_url,token,key): ...@@ -220,14 +223,13 @@ def RequestUrl(dic_url,token,key):
sid = dic_url['sid'] sid = dic_url['sid']
biz = dic_url['biz'] biz = dic_url['biz']
fakeid = biz + '==' fakeid = biz + '=='
url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1' url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={i}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
ret = -1 ret = -1
json_search = '' json_search = ''
# 获取页数 # 获取页数
try: try:
# ip = baseCore.get_proxy() ip = baseCore.get_proxy()
json_search = s.get(url_search, headers=headers, json_search = s.get(url_search, headers=headers,proxies=ip, verify=False).json() # , proxies=ip, verify=False
verify=False).json() # , proxies=ip, verify=False
str_t = json.dumps(json_search) str_t = json.dumps(json_search)
time.sleep(1) time.sleep(1)
except Exception as e: except Exception as e:
...@@ -243,7 +245,7 @@ def RequestUrl(dic_url,token,key): ...@@ -243,7 +245,7 @@ def RequestUrl(dic_url,token,key):
# {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} 公众号biz错误 链接 # {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} 公众号biz错误 链接
# 'base_resp': {'err_msg': 'ok', 'ret': 0} 正常 # 'base_resp': {'err_msg': 'ok', 'ret': 0} 正常
if ret == 0: if ret == 0:
pass return json_search,ret
elif ret == 200013: elif ret == 200013:
# 重新放入redis # 重新放入redis
# time.sleep(3600) # time.sleep(3600)
...@@ -315,17 +317,17 @@ def job(count,key): ...@@ -315,17 +317,17 @@ def job(count,key):
log.info('===========获取公众号============') log.info('===========获取公众号============')
start_ = time.time() start_ = time.time()
#todo:redis中数据 pop一条 #todo:redis中数据 pop一条
infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode') # infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
if infoSourceCode == 'None' or infoSourceCode == None: # if infoSourceCode == 'None' or infoSourceCode == None:
#当一次采集完之后,重新插入数据并等待插入完成 # #当一次采集完之后,重新插入数据并等待插入完成
getFromSql() # getFromSql()
time.sleep(20) # time.sleep(20)
log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}') # log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
return count # return count
sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' " # sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
# '一带一路百人论坛' # '一带一路百人论坛'
# sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' " sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20230630-0010' "
cursor.execute(sql) cursor.execute(sql)
row = cursor.fetchone() row = cursor.fetchone()
...@@ -362,7 +364,8 @@ def job(count,key): ...@@ -362,7 +364,8 @@ def job(count,key):
cursor_.execute(insertSql, tuple(error)) cursor_.execute(insertSql, tuple(error))
cnx_.commit() cnx_.commit()
return count return count
json_search,ret = RequestUrl(dic_url,token,key) i = 0
json_search,ret = RequestUrl(dic_url,token,key,i)
if ret == 0: if ret == 0:
try: try:
Max_data = int(json_search['app_msg_cnt']) Max_data = int(json_search['app_msg_cnt'])
...@@ -376,7 +379,7 @@ def job(count,key): ...@@ -376,7 +379,7 @@ def job(count,key):
Max_data = 5 Max_data = 5
log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----') log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----')
for i in range(0, Max_data, 5): for i in range(0, Max_data, 5):
json_search,ret = RequestUrl(dic_url,token,key) json_search,ret = RequestUrl(dic_url,token,key,i)
if ret == 0: if ret == 0:
pass pass
else: else:
......
# -*- coding: utf-8 -*-
'''
成功100 发送数据失败200 请求失败400 文章内容为空500
'''
import requests, time, random, json, pymysql, redis
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from obs import ObsClient
from kafka import KafkaProducer
# logging.basicConfig(filename='example.log', level=logging.INFO)
from base.BaseCore import BaseCore
import os
baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# cnx = pymysql.connect(host="114.116.44.11", user="root", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4")
# cursor = cnx.cursor()
r = baseCore.r
urllib3.disable_warnings()
def check_url(sid, article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.sismember(f'wx_url_{sid}',article_url)
if res == 1:
return True
else:
return False
def add_url(sid, article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.sadd(f'wx_url_{sid}', article_url, 3) # 注意是 保存set的方式
if res == 0: # 若返回0,说明插入不成功,表示有重复
return True
else:
return False
# #定时
# def getFromSql():
# selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
# cursor.execute(selectSql)
# results = cursor.fetchall()
# result_list = [item[0] for item in results]
#
# #放入redis
# for item in result_list:
# r.rpush('WeiXinGZH:infoSourceCode', item)
#
# #刷新浏览器并获得token
# def flushAndGetToken(list_b):
# browser_run = list_b[0]
# log.info('======刷新浏览器=====')
# browser_run.refresh()
# cookie_list = browser_run.get_cookies()
# cur_url = browser_run.current_url
# token = cur_url.split('token=')[1]
# log.info(f'===========当前token为:{token}============')
# cookies = {}
# for cookie in cookie_list:
# cookies[cookie['name']] = cookie['value']
# return token,cookies
#采集失败的公众号 重新放入redis
def rePutIntoR(item):
r.rpush('WeiXinGZH:infoSourceCode', item)
def updatewxLink(link,info_source_code,state):
updateSuccess = f"update wx_link set state= {state} where link='{link}' and info_source_code='{info_source_code}' "
cursor_.execute(updateSuccess)
cnx_.commit()
def getjsonInfo():
#从数据库中获取信息 一条
select_sql = "select * from wx_link where state=0 order by id asc limit 1"
cursor_.execute(select_sql)
row = cursor_.fetchone()
if row:
pass
else:
log.info('-----没有数据了-----')
return False
dict_json = {
'sid':row[1],
'site_uri':row[2],
'site_name':row[3],
'info_source_code':row[4],
'title':row[5],
'publish_time':row[6],
'link':row[7]
}
# 拿到一条数据 更新状态
update_sql = f"update wx_link set state=1 where link='{row[7]}' and info_source_code='{row[4]}' "
cursor_.execute(update_sql)
cnx_.commit()
return dict_json
def get_info(dict_json):
# list_all_info = []
# num_caiji = 0
kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
news_title = dict_json['title']
sid = dict_json['sid']
news_date = dict_json['publish_time']
origin = dict_json['site_name']
url_news = dict_json['link']
info_source_code = dict_json['info_source_code']
# url_ft = check_url(sid, url_news)
# if url_ft:
# return list_all_info,num_caiji
try:
ip = baseCore.get_proxy()
res_news = requests.get(url_news, proxies=ip,timeout=20)
except:
#400请求失败
updatewxLink(url_news,info_source_code,400)
return False
soup_news = BeautifulSoup(res_news.content, 'html.parser')
news_html = soup_news.find('div', {'id': 'js_content'})
try:
del news_html['style']
del news_html['id']
del news_html['class']
except:
pass
try:
news_content = news_html.text
except:
log.info(f'--------内容为空--------{url_news}--------')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
false = [
news_title,
url_news,
news_html,
'文章内容为空',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
updatewxLink(url_news,info_source_code,500)
return False
list_img = news_html.find_all('img')
for num_img in range(len(list_img)):
img_one = list_img[num_img]
url_src = img_one.get('data-src')
# print(url_src)
        if not url_src or 'gif' in url_src:
url_img = ''
img_one.extract()
else:
try:
name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
except:
img_one.extract()
continue
            try:
                res = requests.get(url_src, timeout=20)
            except:
                # 图片下载失败时移除该img节点并跳过,避免引用未定义的res
                img_one.extract()
                continue
            resp = obsClient.putContent('zzsn', name_img, content=res.content)
            url_img = resp['body']['objectUrl']
str_url_img = f'<img src="{url_img}">'
try:
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
except Exception as e:
log.info(f'----{url_news}-----------{e}')
return False
for tag in news_html.descendants:
try:
del tag['style']
except:
pass
list_section = news_html.find_all('section')
for section in list_section:
section.name = 'div'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'sid': sid,
'title': news_title,
'content': news_content,
'contentWithtag': str(news_html),
'summary': '',
'author': '',
'origin': origin,
'publishDate': news_date,
'sourceAddress': url_news,
'source': '11',
'createDate': time_now
}
for nnn in range(0, 3):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
kafka_time_out = kafka_result.get(timeout=10)
# add_url(sid, url_news)
break
except:
time.sleep(5)
log.info('------数据发送kafka失败------')
updatewxLink(url_news,info_source_code,200)
continue
list_all_info.append(dic_info)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info2 = {
'infoSourceId': sid,
'code': info_source_code,
'num': num_caiji,
'collectTime': kaishi_time,
'dispatcherTime': time_now,
'dispatcherStatus': '1',
'source': '1',
}
for nnn2 in range(0, 3):
try:
producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result2 = producer2.send("collectionAndDispatcherInfo",
json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
break
except:
time.sleep(5)
continue
updatewxLink(url_news,info_source_code,100)
return True
if __name__=="__main__":
num_caiji = 0
list_all_info = []
while True:
#一次拿取一篇文章
dict_json =getjsonInfo()
if dict_json:
if get_info(dict_json):
num_caiji = num_caiji + 1
log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
else:
break
baseCore.close()
\ No newline at end of file
# 微信采集列表数据
import json
import time
import random
import pymysql
import requests
import urllib3
from pymysql.converters import escape_string
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore()
log = baseCore.getLogger()
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}
s = requests.session()
cnx = pymysql.connect(host="114.116.44.11", user="caiji", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4")
cursor = cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
r = baseCore.r
def resHtml(token,url,cookies):
try:
ip = baseCore.get_proxy()
s=requests.session()
cookie_jar = requests.utils.cookiejar_from_dict(cookies, cookiejar=None, overwrite=True)
s.cookies = cookie_jar
# json_search = s.get(url, headers=headers, proxies=ip, verify=False).json()
json_search = s.get(url, headers=headers, proxies=ip,verify=False).json()
aa=s.cookies.get_dict()
updateCookieToken(token, json.dumps(aa))
except Exception as e:
json_search= {}
return json_search
#采集失败的公众号 重新放入redis
def rePutIntoR(item):
r.rpush('WeiXinGZH:infoSourceCode', item)
#获取公众号信息
def getSourceInfo(infoSourceCode):
sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
cursor.execute(sql)
row = cursor.fetchone()
dic_url = {
'url_': row[0],
'sid': row[1],
'name': row[2],
'info_source_code': row[3],
'biz': ''
}
url_ = dic_url['url_']
origin = dic_url['name']
info_source_code = dic_url['info_source_code']
sid = dic_url['sid']
try:
biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
dic_url['biz'] = biz
except Exception as e:
log.info(f'---公众号--{origin}---biz错误')
error = [
origin,
url_,
info_source_code,
e,
'biz错误'
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,now())"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return False
return dic_url
#保存错误日志
def insertBadSql(error):
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,now())"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
#保存文章列表数据
def insertWxList(dic_url,json_search,page):
list_all_news = json_search['app_msg_list']
listCount=0
repetCount=0
insertCount=0
for one_news in list_all_news:
listCount=listCount+1
news_title = one_news['title']
timestamp = one_news['create_time']
time_local = time.localtime(timestamp)
news_date = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
url_news = one_news['link']
selectCountSql=f"select count(1) from wx_link where link='{escape_string(url_news)}'"
cursor_.execute(selectCountSql)
count = cursor_.fetchone()[0]
if count > 0:
repetCount=repetCount+1
continue
else:
insertCount=insertCount+1
try:
insertSql=f"insert into wx_link(sid,site_uri,site_name,info_source_code,title,publish_time,link,state,create_time) values " \
f"('{dic_url['sid']}','{dic_url['url_']}','{dic_url['name']}','{dic_url['info_source_code']}','{escape_string(news_title)}','{escape_string(news_date)}','{escape_string(url_news)}',0,now())"
cursor_.execute(insertSql)
cnx_.commit()
except Exception as e:
log.error(f"保存数据库失败:{e}")
log.info(f"---{dic_url['name']}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount==0:
#列表为空认为结束
return True
if repetCount>= listCount/2:
#重复数量大于等于一半认为结束
return True
#没有结束
return False
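# 翻页结束判定示例:某页返回5条,其中3条已在wx_link表中(repetCount=3 >= listCount/2),
# insertWxList返回True,调用方getPageData/getWxList据此停止继续翻页。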
#token的处理
def updateTokeen(token,type):
if type==2:
#session失效,删除token
cursor_.execute(f"delete from weixin_tokenCookies where token={token}")
if type ==1:
#封号了 修改封号时间
cursor_.execute(f"update weixin_tokenCookies set fenghao_time=now() where token={token}")
if type ==3:
#封号了 修改封号时间
cursor_.execute(f"update weixin_tokenCookies set update_time=now() where token={token}")
cnx_.commit()
#token的处理
def updateCookieToken(token,cookies):
cursor_.execute(f"update weixin_tokenCookies set cookies='{escape_string(cookies)}' where token={token}")
cnx_.commit()
#获取token
def getToken():
cursor_.execute(f"select token,cookies from weixin_tokenCookies where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
row = cursor_.fetchall()
if row:
pass
else:
#没有查到token
return False
return row[0]
#获取列表数据
def getPageData(dic_url,page):
url_ = dic_url['url_']
origin = dic_url['name']
info_source_code = dic_url['info_source_code']
biz = dic_url['biz']
fakeid = biz + '=='
tokenAndCookie = getToken()
if tokenAndCookie:
pass
else:
while True:
time.sleep(60)
tokenAndCookie = getToken()
if tokenAndCookie:
break
token = tokenAndCookie[0]
log.info(f"获取token到----{token}")
cookies = json.loads(tokenAndCookie[1])
# s.cookies.update(cookies)
url = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={(page - 1) * 5}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
# reponse = s.get(url, headers=headers, proxies=ip, verify=False)
# json_search = reponse.json()
# newcookies = requests.utils.dict_from_cookiejar(reponse.cookies, cookiejar=None, overwrite=True)
# s.cookies = newcookies
# updateCookieToken(token,json.dumps(s.cookies))
#调用方法
json_search=resHtml(token,url,cookies)
str_t = json.dumps(json_search)
ret = json_search['base_resp']['ret']
if ret == 0:
pass
elif ret == 200013:
log.info(f'======{origin}-----{biz}----该账号被封=======')
#封号修改token
updateTokeen(token,1)
return getPageData(dic_url,page)
elif ret == 200002:
log.info(f'======{origin}-----{biz}----该账号biz错误,请检查=======')
error = [origin, url_, info_source_code, str_t, '无效biz参数']
insertBadSql(error)
return True
elif ret == 200003:
log.info(f'======{origin}-----{biz}----该账号无效session=======')
# session失效修改token
updateTokeen(token, 2)
error = [origin, url_, info_source_code, str_t, '无效session']
insertBadSql(error)
return getPageData(dic_url, page)
else:
log.info(f'======{origin}-----{biz}----该账号其他错误=======')
error = [origin, url_, info_source_code, str_t, '其他错误']
insertBadSql(error)
return True
# 修改token使用时间
updateTokeen(token, 3)
# 保存数据到数据库
return insertWxList(dic_url,json_search,page)
#获取微信公众号数据
def getWxList(infoSourceCode):
dic_url = getSourceInfo(infoSourceCode)
log.info(f"======{infoSourceCode}----开始采集=======")
if dic_url:
pass
else:
log.info(f'======{infoSourceCode}---------该账号biz错误,请检查=======')
error = ['', '', infoSourceCode, '', '该账号biz错误']
insertBadSql(error)
return
origin = dic_url['name']
biz = dic_url['biz']
for page in range(1,2):
retFlag = getPageData(dic_url, page)
time.sleep(random.randint(60,181))
if retFlag:
#结束 跳出该公众号
break
else:
#没有结束
pass
log.info(f"======{origin}-----{biz}----结束采集=======")
def getFromSql():
selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
cursor.execute(selectSql)
results = cursor.fetchall()
result_list = [item[0] for item in results]
#放入redis
for item in result_list:
r.rpush('WeiXinGZH:infoSourceCode', item)
if __name__=="__main__":
while True:
infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
if infoSourceCode == 'None' or infoSourceCode == None:
log.info("redis已经没有数据了,重新放置数据")
getFromSql()
time.sleep(10)
infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
getWxList(infoSourceCode)
# infoSourceCode = 'IN-20220917-0159'
# getWxList(infoSourceCode)
...@@ -88,7 +88,7 @@ chrome_options.add_argument('--headless') ...@@ -88,7 +88,7 @@ chrome_options.add_argument('--headless')
executable_path = r'D:\chrome\chromedriver.exe' executable_path = r'D:\chrome\chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path) driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
def scroll(driver): def scroll(driver):
for i in range(0,30): for i in range(0,30):
......
雅虎财经 国外上市企业信息采集 雅虎财经企业动态 部署到香港服务器上
# 雅虎财经企业动态获取 # -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# 雅虎财经企业动态获取 # 雅虎财经企业动态获取
import json import json
import time import time
...@@ -62,7 +63,7 @@ def getZx(xydm, url, title, cnx, path): ...@@ -62,7 +63,7 @@ def getZx(xydm, url, title, cnx, path):
'雅虎财经', '雅虎财经',
author, author,
'2', '2',
'zh' 'en'
] ]
try: try:
...@@ -180,15 +181,15 @@ def scroll(xydm,name,gpdm): ...@@ -180,15 +181,15 @@ def scroll(xydm,name,gpdm):
break break
last_url_ = last_url last_url_ = last_url
#采集失败的公众号 重新放入redis #采集失败的企业 重新放入redis
def rePutIntoR(item): def rePutIntoR(item):
r.rpush('NewsEnterprise:gwqy_socialCode', item) r.rpush('NewsEnterprise:gwqy_socialCode', item)
if __name__ == "__main__": if __name__ == "__main__":
path = r'F:\spider\115\chromedriver.exe' path = r'D:\chrome\chromedriver.exe'
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor() cursor = cnx.cursor()
while True: while True:
...@@ -197,9 +198,10 @@ if __name__ == "__main__": ...@@ -197,9 +198,10 @@ if __name__ == "__main__":
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if not social_code : if not social_code :
log.info('============已没有数据============等待===============')
time.sleep(20) time.sleep(20)
continue continue
if social_code == 'None': if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
data = baseCore.getInfomation(social_code) data = baseCore.getInfomation(social_code)
......
...@@ -214,7 +214,7 @@ class BaseCore: ...@@ -214,7 +214,7 @@ class BaseCore:
except : except :
pass pass
def __init__(self): def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4') charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor() self.__cursor_proxy= self.__cnx_proxy.cursor()
pass pass
......
...@@ -5,7 +5,7 @@ pass=clbzzsn ...@@ -5,7 +5,7 @@ pass=clbzzsn
[mysql] [mysql]
host=114.115.159.144 host=114.115.159.144
username=root username=caiji
password=zzsn9988 password=zzsn9988
database=caiji database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
......
...@@ -215,7 +215,7 @@ class BaseCore: ...@@ -215,7 +215,7 @@ class BaseCore:
except : except :
pass pass
def __init__(self): def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4') charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor() self.__cursor_proxy= self.__cnx_proxy.cursor()
pass pass
......
...@@ -5,7 +5,7 @@ pass=clbzzsn ...@@ -5,7 +5,7 @@ pass=clbzzsn
[mysql] [mysql]
host=114.115.159.144 host=114.115.159.144
username=root username=caiji
password=zzsn9988 password=zzsn9988
database=caiji database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
......
import os
import pandas as pd import pandas as pd
import pymysql import pymysql
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pymysql.converters import escape_string from pymysql.converters import escape_string
from selenium.webdriver.common.by import By import downPdf
from BaseCore import BaseCore
from base.BaseCore import BaseCore from datetime import datetime
baseCore = BaseCore() baseCore = BaseCore()
log =baseCore.getLogger() log =baseCore.getLogger()
headers = { headers = {
...@@ -28,20 +29,72 @@ headers = { ...@@ -28,20 +29,72 @@ headers = {
cnx = baseCore.cnx cnx = baseCore.cnx
cursor = baseCore.cursor cursor = baseCore.cursor
def downFile(url,path,pdf_name):
try:
baseCore.mkPath(path)
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url, headers=headers, verify=False, timeout=10)
# response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
pdf_name = pdf_name +'.pdf'
with open(os.path.join(path, pdf_name), "wb") as pyFile:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
pyFile.write(chunk)
except Exception as e:
log.error(f"出错了----------{e}")
return False
return pdf_name
def job_2(): def job_2():
log.info('----开始采集---俄罗斯国家杂志----') log.info('----开始采集---俄罗斯国家杂志----')
path = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe' # path = 'D:chrome/chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False) # driverContent = baseCore.buildDriver(path, headless=False)
url = 'http://publication.pravo.gov.ru/documents/block/president' for i in range(68,200):
req = requests.get(url,headers) if i == 1:
soup = BeautifulSoup(req.content,'html.parser') url = 'http://publication.pravo.gov.ru/documents/block/president'
container = soup.find('div',class_='documents-container') else:
web_list = container.find_all('div',class_='documents-table-row') url = f'http://publication.pravo.gov.ru/documents/block/president?index={i}&pageSize=30'
for web in web_list[:1]: req = requests.get(url,headers)
web_href = web.find('a')['href'] soup = BeautifulSoup(req.content,'html.parser')
web_url = 'http://publication.pravo.gov.ru/' + web_href container = soup.find('div',class_='documents-container')
title = web.find('a').text web_list = container.find_all('div',class_='documents-table-row')
print(title) for web in web_list:
                title = web.find_all('a')[1].text
                pdftitle = title.strip()
                if '"О' in title:
                    pdftitle = title.strip().split('"О')[0]
                if '-рп' in title:
                    pdftitle = title.strip().split('-рп')[0] + '-рп'
pdfUrl = 'http://publication.pravo.gov.ru' + web.find('div',class_='notforprint pt-2').find('a')['href']
# pdfTitle = aa.find('a')['title']
print(pdfUrl)
selectCountSql = f"select * from usvsrussia where url = '{pdfUrl}' "
cursor.execute(selectCountSql)
url = cursor.fetchone()
if url:
log.info("已采集,跳过")
continue
else:
pass
date_string = web.find('div',class_='infoindocumentlist').find_all('div')[1].find('span',class_='info-data').text
#时间格式转化
date_object = datetime.strptime(date_string, "%d.%m.%Y")
pub_time = date_object.strftime("%Y-%m-%d")
print(pub_time)
pdf_name = web.find('div',class_='infoindocumentlist').find_all('div')[0].find('span',class_='info-data').text
#下载pdf
path=r'D:\美国VS俄罗斯制裁'
path = os.path.join(path, downPdf.getPath(pdftitle))
downFile(pdfUrl,path,pdf_name)
insertSql = f"insert into usvsrussia (website,url,title,pub_time,state,pdf_name,pdf_path,create_time) values ('总统令文件','{pdfUrl}','{escape_string(pdftitle)}','{pub_time}',0,'{pdf_name}','{path}',now() )"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# break
job_2()
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -215,7 +215,7 @@ class BaseCore: ...@@ -215,7 +215,7 @@ class BaseCore:
except : except :
pass pass
def __init__(self): def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4') charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor() self.__cursor_proxy= self.__cnx_proxy.cursor()
pass pass
......
[redis] [redis]
...@@ -5,7 +5,7 @@ pass=clbzzsn ...@@ -5,7 +5,7 @@ pass=clbzzsn
[mysql] [mysql]
host=114.115.159.144 host=114.115.159.144
username=root username=caiji
password=zzsn9988 password=zzsn9988
database=caiji database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
......