Commit 31e25a8d  Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

......@@ -650,7 +650,7 @@ class BaseCore:
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
def tableUpdate(self,retData,com_name,year,pdf_name,num,pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
......@@ -670,12 +670,12 @@ class BaseCore:
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
create_time, page_size,pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
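A minimal sketch (not the project's BaseCore code) of keeping the column list and the values tuple in one place, so adding a field such as publish_time cannot leave the placeholder list out of sync; it assumes a pymysql-style cursor/connection like self.cursor_ / self.cnx_ above:

def insert_attachment(cursor, cnx, row: dict):
    # row maps clb_sys_attachment column names to values, e.g. {'year': ..., 'publish_time': pub_time}
    columns = list(row.keys())
    placeholders = ','.join(['%s'] * len(columns))
    sql = f"insert into clb_sys_attachment({','.join(columns)}) values({placeholders})"
    cursor.execute(sql, tuple(row.values()))
    cnx.commit()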
......@@ -759,7 +759,7 @@ class BaseCore:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
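A small standalone sketch of the split-and-unquote step used above for path / full_path; the example object URL is made up for illustration, only the 'zzsn' bucket name comes from the surrounding code:

from urllib.parse import unquote

def obs_url_to_paths(object_url: str):
    full_path = unquote(object_url)              # decoded, browsable URL
    path = unquote(object_url.split('.com')[1])  # object key part after the bucket domain
    return path, full_path

# obs_url_to_paths('https://zzsn.obs.example.com/ZJH/2023-10/%E5%B9%B4%E6%8A%A5.pdf')
# -> ('/ZJH/2023-10/年报.pdf', 'https://zzsn.obs.example.com/ZJH/2023-10/年报.pdf')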
......
import json
import json
......@@ -133,7 +133,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
return False
#插入数据库获取att_id
num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time)
if att_id:
pass
else:
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -164,7 +164,7 @@ def spider_annual_report(dict_info,num):
return False
num = num + 1
try:
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time)
content = retData['content']
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
......
......@@ -174,7 +174,7 @@ def spider(com_name,cik,up_okCount):
'summary': '',
'title': title,
'type': 1,
'socialCreditCode': '',
'socialCreditCode': social_code,
'year': year
}
# print(dic_news)
......@@ -207,7 +207,7 @@ def spider(com_name,cik,up_okCount):
# 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{news_url}:新增一条')
except Exception as e:
log.error(f'传输失败:{social_code}----{news_url}-----{e}')
log.error(f'数据库传输失败:{social_code}----{news_url}-----{e}')
e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -64,6 +64,7 @@ class YahooCaiwu(object):
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
......@@ -376,6 +377,10 @@ class YahooCaiwu(object):
#对比指标计算
def calculateIndexReq(self):
get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
try:
params={
'type':2
......@@ -399,6 +404,7 @@ if __name__ == '__main__':
# parse_excel()
#get_content1()
yahoo=YahooCaiwu()
while True:
securitiescode=''
try:
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -58,12 +58,14 @@ class YahooCaiwu(object):
# 雅虎财经处理表格
def deal_table(self,doc_resp):
try:
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
......@@ -126,6 +128,10 @@ class YahooCaiwu(object):
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
except Exception as e:
all_dict['表头'] = {}
all_dict['目录'] = {}
all_dict['内容'] = {}
return all_dict
......@@ -157,7 +163,8 @@ class YahooCaiwu(object):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' """ # and stock_code = "SYNH"
# sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='RAIZ4.SA' """ # and stock_code = "SYNH"
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' and any_data='0' """ # and stock_code = "SYNH"
# sql1 = f"select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='{securitiescode}' " # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
......@@ -396,6 +403,7 @@ class YahooCaiwu(object):
print('调用接口成功!!')
except:
print('调用失败!')
if __name__ == '__main__':
# parse_excel()
#get_content1()
......
# import redis
#
#
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#
# # 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# res = r.exists(f_key)
# value = list(r.smembers(f_key))
# # 对列表进行排序
# value.sort()
# # 遍历排序后的列表
# list_data = []
# for member in value:
# member = member.decode()
# members = member.strip('[').strip(']').replace('\'','').strip().split(',')
# #获取每一个报告期
# for date in members:
# data = date.strip()
# # print(date.strip())
# list_data.append(data)
# # 放入redis
# for item in list_data:
# r.sadd(key, item)
#
# # 获取Set中的所有元素
# items = r.smembers(key)
# # print(items)
# print("======================================")
import re
from urllib.parse import quote, unquote
import requests
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import json
import difflib
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import datetime
timestamp = 1688054400 # 示例时间戳
date = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
headers={
'Connection':'keep-alive',
'Pragma':'no-cache',
'Cache-Control':'no-cache',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Referer':'https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'UOR=,finance.sina.com.cn,; SINAGLOBAL=123.149.3.173_1695815968.404462; Apache=123.149.3.173_1695815968.404463; ULV=1695816017391:2:2:2:123.149.3.173_1695815968.404463:1695815967476; lxlrttp=1578733570; U_TRS1=000000ad.bc7f83f51.651419db.690100f2; U_TRS2=000000ad.bc8a83f51.651419db.138fca70; SUB=_2AkMSSJVgf8NxqwFRmP0XzG3kbIxxyA_EieKkFGS7JRMyHRl-yD9kqhY-tRB6Oci7j27VGy6gikgIaUYBZsIPzk3PbLLC; hqEtagMode=1',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
def reqHtml(url):
res=requests.get(url,headers=headers,verify=False,timeout=10)
res.encoding='GB18030'
text=res.text
return text
headers2={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'0',
'Cookie':'HWWAFSESID=fd8b573695b0ce804b; HWWAFSESTIME=1695799275143',
'Host':'www.qyyjt.cn',
'Origin':'https://www.qyyjt.cn',
'Pragma':'no-cache',
'Referer':'https://www.qyyjt.cn/detail/enterprise/overview?code=56CD928FAD278663E73BE7486C764DA7&type=company',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'client':'pc-web;pro',
'dataid':'869',
'pcuss':'eyJ0eXAiOiJKV1QiLCJ0eXBlIjoiand0IiwiYWxnIjoiSFMyNTYifQ.eyJjcmVhdGVUaW1lIjoiMjAyMy0wOS0yNyAyMDoxODowMy40NDkiLCJleHAiOjE2OTU4MTc5ODMsInVzZXJJZCI6IjIwMjMwOTI3MTUyMzA0XzEzNTkyNDgxODM5IiwiZXhwaXJlZFRpbWUiOiIyMDIzLTA5LTI3IDIwOjMzOjAzLjQ0OSJ9.SouwRylKogHfJILh97JMnYRzcJuj2Hg30BmQa9gc-Nc',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'system':'new',
'terminal':'pc-web;pro',
'user':'847E223529194582C37A02EEEC8AC09F0D7AD12E40778D6CA9CFB91F69F8C537',
'ver':'20230914',
'x-request-id':'x1eCRO-X8D7',
}
def reqPostMsg(url):
res=requests.post(url,headers=headers2,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
return text
def get_realurl(tmpurl):
try:
pattern='url=(.{1,}?)&aid'
match = re.search(pattern, tmpurl)
# 判断是否匹配成功
if match:
# 获取匹配的结果
result = match.group(1)
result=unquote(result)
else:
result=''
except:
result=''
return result
def getFormatedate(timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
print(date)
url='https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter'
ttext=reqHtml(url)
soup=BeautifulSoup(ttext,'html.parser')
tdoc=soup.select('div[class="tbl_wrap"]>table[class="data_tbl os_tbl"]')[0]
print(str(tdoc))
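A follow-on sketch of flattening the selected data_tbl table into plain rows, assuming the Sina income page keeps its th/td layout (the helper name is ours, not part of the script above):

def table_to_rows(table_tag):
    # table_tag is the <table class="data_tbl os_tbl"> element selected above
    rows = []
    for tr in table_tag.find_all('tr'):
        cells = [c.get_text(strip=True) for c in tr.find_all(['th', 'td'])]
        if cells:
            rows.append(cells)
    return rows

# rows = table_to_rows(tdoc); rows[0] is the report-period header, later rows hold one indicator each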
......@@ -461,7 +461,7 @@ def listPage():
}
]
for operand in operands:
logger.info(f'采集地域股票信息{operand}')
rego=operand['operands'][1]
#第一次请求获取地域总共有的股票代码数量
try:
stockmsg=reqmsg(0,operand)
......@@ -469,21 +469,23 @@ def listPage():
except Exception as e:
logger.info(f'region该地域没有股票信息{operand}')
continue
logger.info(f'采集地域股票信息{rego}---对应的数量{total}')
for i in range(0,total,100):
logger.info(f"offset的值{i}")
stockmsg=reqmsg(i,operand)
if stockmsg:
try:
getStock(stockmsg)
getStock(stockmsg,rego)
except Exception as e:
logger.info(f"解析失败{e}")
time.sleep(3)
def getStock(stockmsg):
def getStock(stockmsg,rego):
quotes=stockmsg['finance']['result'][0]['quotes']
for quote in quotes:
symbol=quote['symbol']
logger.info(f"{rego}地区对应的股票代码{symbol}")
try:
longName=quote['longName']
except:
......
import json
import json
......@@ -37,7 +37,7 @@ def convert_size(size_bytes):
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
......@@ -55,7 +55,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
try:
name = pdf_name + '.pdf'
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
result = obsClient.putContent('zzsn', 'QYNotice/'+name, content=response.content)
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
......@@ -113,12 +113,12 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
# id = ''
# return id
# else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
create_time, page_size,path,'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
......@@ -277,9 +277,9 @@ def InsterInto(social_code, pdf_url,pub_time):
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
#判断文件是否已经存在obs服务器中
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告'
now_time = time.strftime("%Y-%m")
file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
file_path = 'QYNotice/'+pdf_name+'.pdf'
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
log.info('=====文件不存在obs=====')
......@@ -372,19 +372,23 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
except:
pass
# # 先获取页数
# page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
#
# total = re.findall(r'\d+', page)[0]
#
# r_page = int(total) % 15
# if r_page == 0:
# Maxpage = int(total) // 15
# else:
# Maxpage = int(total) // 15 + 1
# log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# # 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1,51):
if Maxpage < 50:
pass
else:
Maxpage = 50
for i in range(1, Maxpage + 1):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
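The re-enabled paging above is ceiling division over 15 records per page with a 50-page cap; a compact sketch of the same calculation that iterates the pages inclusively:

import math

def page_numbers(total_records: int, page_size: int = 15, cap: int = 50):
    max_page = min(math.ceil(total_records / page_size), cap)
    return range(1, max_page + 1)   # include the last page

# list(page_numbers(31)) -> [1, 2, 3]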
......
......@@ -533,12 +533,12 @@ class BaseCore:
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time)
create_time,path,'zzsn')
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
......@@ -558,11 +558,11 @@ class BaseCore:
i += 1
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(self,file_href,item_id,pathType,file_name):
def uptoOBS(self,file_href,item_id,file_name):
headers = {}
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
......@@ -583,7 +583,7 @@ class BaseCore:
pass
else:
file_name = file_name + '.' + category
result = obsClient.putContent('zzsn', f'{pathType}' + file_name, content=response.content)
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
break
except:
time.sleep(3)
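The putContent call above sits in a bare try/except retry loop; a small sketch of the same retry-with-sleep pattern as a helper (obsClient and the 'zzsn' bucket come from the code above, the attempt count is an assumption):

import time

def put_with_retry(obs_client, bucket, key, content, attempts=3, wait=3):
    for i in range(attempts):
        try:
            return obs_client.putContent(bucket, key, content=content)
        except Exception:
            if i == attempts - 1:
                raise
            time.sleep(wait)

# result = put_with_retry(obsClient, 'zzsn', 'PolicyDocuments/' + file_name, response.content)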
......
......@@ -91,7 +91,8 @@ def save_data(dic_news):
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate']
'创建时间': dic_news['createDate'],
'带标签内容':dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
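The spiders below dedupe on 网址 with a find_one lookup before inserting; a short pymongo sketch of backing that check with a unique index so a racing duplicate is also rejected (the connection URI and db/collection names are placeholders, only the 网址 field comes from this hunk):

from pymongo import MongoClient, errors

db_storage = MongoClient('mongodb://localhost:27017')['zzsn']['policy']   # placeholder names
db_storage.create_index('网址', unique=True)

def save_if_new(doc: dict) -> bool:
    try:
        db_storage.insert_one(doc)
        return True
    except errors.DuplicateKeyError:
        return False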
......@@ -138,6 +139,7 @@ def remove_dup():
# 国务院文件
def get_content1():
pathType = 'policy/gwywj/'
def getPageConunt(a_list, url, headers, s):
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
......@@ -256,7 +258,7 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1766')
retData = baseCore.uptoOBS(file_href,'1766',pathType,file_name)
if retData['state']:
pass
else:
......@@ -265,7 +267,7 @@ def get_content1():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
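paserUrl is called throughout these spiders to fix up links before the attachment rewrite above, but its body is not part of this diff; a rough sketch of the relative-to-absolute resolution it presumably performs (the function name is kept, the implementation is an assumption):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # make every href/src absolute against the page URL so attachments can be fetched directly
    for tag in soup.find_all(['a', 'img', 'link', 'script']):
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(base_url, tag[attr])
    return soup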
......@@ -308,6 +310,7 @@ def get_content1():
# 国务院部门文件
def get_content2():
pathType = 'policy/gwybmwj/'
def getTotalpage(bmfl,headers,session):
ip = baseCore.get_proxy()
pageNo = 1
......@@ -336,6 +339,7 @@ def get_content2():
session.keep_alive = False
start_time = time.time()
num = 0
count = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
'人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
'国家卫生健康委员会',
......@@ -396,6 +400,9 @@ def get_content2():
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
......@@ -407,7 +414,7 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1699')
retData = baseCore.uptoOBS(file_href,'1699',pathType,file_name)
if retData['state']:
pass
else:
......@@ -416,7 +423,7 @@ def get_content2():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
except:
log.error(f'{title}...{href}获取内容失败')
continue
......@@ -446,6 +453,7 @@ def get_content2():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
......@@ -454,10 +462,11 @@ def get_content2():
log.error(f'{bmfl}...获取页数失败')
continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
# 国务院国有资产监督管理委员会-政策发布
def get_content3():
pathType = 'policy/gyzc/'
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=headers, verify=False)
......@@ -499,6 +508,9 @@ def get_content3():
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}----{title}----内容为空----')
return
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
......@@ -510,7 +522,7 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1642')
retData = baseCore.uptoOBS(file_href,'1642',pathType,file_name)
if retData['state']:
pass
else:
......@@ -519,7 +531,7 @@ def get_content3():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
......@@ -542,7 +554,7 @@ def get_content3():
'summary': '', #摘要
'title': title #标题
}
# print(title)
# log.info(title)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
......@@ -550,6 +562,7 @@ def get_content3():
def partTwo():
start_time = time.time()
num = 0
count = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
......@@ -570,12 +583,14 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
def partOne():
start_time = time.time()
num = 0
count = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
try:
# get请求,需要取消ssl验证
......@@ -603,10 +618,11 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
partOne()
# 增量执行需要注释掉partTwo()
......@@ -614,7 +630,7 @@ def get_content3():
# 北京
def bei_jing():
num = 0
start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium
......@@ -662,6 +678,7 @@ def bei_jing():
time.sleep(2)
log.info(f'------{len(hrefs)}条数据-------------')
num = 0
count = 0
for href in hrefs:
id_list = []
title = href[1]
......@@ -700,12 +717,15 @@ def bei_jing():
soup = paserUrl(soup_cont, href[0])
soup.prettify()
if soup.text == '' or soup.text == 'None':
log.info(f'----{href[0]}----{title}----内容为空----')
continue
# todo:去掉扫一扫
try:
soup.find('div', id='div_div').decompose()
except:
continue
# print(title)
# log.info(title)
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
......@@ -756,11 +776,10 @@ def bei_jing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
bro.quit()
except Exception as e:
log.info(e)
......@@ -827,6 +846,9 @@ def nei_meng_gu():
else:
i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
content = str(i_content)
if i_content.text == '' or i_content.text == 'None':
log.info(f'{real_href}------{title}----内容为空-----')
continue
# todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
......@@ -849,7 +871,7 @@ def nei_meng_gu():
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id)
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -892,6 +914,7 @@ def ji_lin():
pathType = 'policy/jilin/'
start = time.time()
num = 0
count = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
try:
resp_text = requests.get(url=url, headers=headers, verify=False)
......@@ -964,6 +987,9 @@ def ji_lin():
i_content = soup
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 发文字号
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
......@@ -1010,6 +1036,9 @@ def ji_lin():
p.extract()
contentWithTag = i_content
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 找到附件上传至文件服务器
fj_soup = i_soup.find('div', class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
......@@ -1040,7 +1069,7 @@ def ji_lin():
soup.find('div', id='qr_container').decompose()
else:
pass
print(title)
log.info(title)
# print('............................................................')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -1073,13 +1102,14 @@ def ji_lin():
if flag:
save_data(dic_news)
num = num + 1
count += 1
except Exception as e:
log.info(e)
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 上海
......@@ -1087,6 +1117,7 @@ def shang_hai():
start = time.time()
pathType = 'policy/shanghai/'
num = 0
count =0
for page in range(1, 7):
if page == 1:
......@@ -1111,7 +1142,7 @@ def shang_hai():
num+=1
continue
try:
href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
# href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
href_text = requests.get(url=href, headers=headers, verify=False).text
doc_href = pq(href_text)
doc_href_ = BeautifulSoup(href_text, 'html.parser')
......@@ -1120,6 +1151,9 @@ def shang_hai():
info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
pub_source = info_list[1].find('b').text.split('信息来源:')[1]
content = doc_href_.find('div', attrs={'class': 'detail_03'})
if content is None or content.text.strip() == '':
log.info(f'{href}-----{title}----内容为空')
continue
# 将文章中的附件字段删去
pattern = r'\d+\.'
......@@ -1181,7 +1215,7 @@ def shang_hai():
else:
continue
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -1209,18 +1243,19 @@ def shang_hai():
if flag:
save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 浙江
def zhe_jiang():
start = time.time()
pathType = 'policy/zhejiang/'
num = 0
count = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try:
res = requests.get(url, headers).content
......@@ -1235,7 +1270,7 @@ def zhe_jiang():
href = li.find('a')['href']
pub_time = li.find('a').find('span').text
title = li.find('a').text.replace(pub_time, '').strip()
# print(title)
# log.info(title)
if 'http' in href:
href = href
else:
......@@ -1302,9 +1337,12 @@ def zhe_jiang():
# fj_href_list.append(fujian_href)
# print(fj_href_list)
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
dic_news = {
'attachmentIds': [],
'author': '',
......@@ -1329,20 +1367,21 @@ def zhe_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 福建
def fu_jian():
error_tag = str(404)
pathType = 'policy/fujian/'
num = 0
count = 0
start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
try:
......@@ -1386,8 +1425,8 @@ def fu_jian():
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
# print(real_href)
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
num+=1
......@@ -1437,6 +1476,7 @@ def fu_jian():
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
if retData['state']:
......@@ -1453,6 +1493,9 @@ def fu_jian():
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
except:
......@@ -1460,6 +1503,9 @@ def fu_jian():
pub_time = ''
contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -1484,18 +1530,19 @@ def fu_jian():
'summary': '',
'title': title
}
# print(dic_news)
# log.info(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 山东
def shan_dong():
......@@ -1505,6 +1552,7 @@ def shan_dong():
}
start = time.time()
num = 0
count = 0
url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
for url in url_list:
try:
......@@ -1539,6 +1587,9 @@ def shan_dong():
# print(pub_time,pub_source,pub_hao)
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if pub_hao == '无':
p_list = contentwithtag.find_all('p')
for p in p_list:
......@@ -1571,6 +1622,9 @@ def shan_dong():
i = i + 1
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1597,23 +1651,22 @@ def shan_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
if content == '' or content == 'None':
continue
else:
print(title)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 广东
def guang_dong():
start = time.time()
pathType = 'policy/guangdong/'
num = 0
count = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try:
resp_href = requests.get(url=url, headers=headers, verify=False)
......@@ -1653,6 +1706,9 @@ def guang_dong():
i_soup = paserUrl(i_soup, href)
content = i_soup.find('div', attrs={'class': 'box_info'})
contentwithTag = str(content)
if content is None or content.text.strip() == '':
log.info(f'{href}-----{title}----内容为空----')
continue
fu_jian_list = content.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -1701,15 +1757,15 @@ def guang_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 海南
def hai_nan():
......@@ -1717,6 +1773,7 @@ def hai_nan():
def hai_nan1():
# 部门文件
num = 0
count = 0
start_time = time.time()
for page in range(13):
if page == 0:
......@@ -1770,6 +1827,9 @@ def hai_nan():
except:
pass
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -1811,6 +1871,9 @@ def hai_nan():
topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip()
contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = source.find_all('a')
try:
for fu_jian in fu_jian_list:
......@@ -1862,6 +1925,9 @@ def hai_nan():
topicClassification = ''
contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1888,19 +1954,20 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def hai_nan2():
def hai_nan_sw(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -1936,6 +2003,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1961,10 +2031,11 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
log.info(title)
num += 1
count += 1
href_text.close()
except:
pass
req.close()
......@@ -1972,6 +2043,7 @@ def hai_nan():
def hai_nan_szf(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -2010,6 +2082,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
except:
# print(href)
pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
......@@ -2021,6 +2096,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2068,10 +2146,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -2079,6 +2159,7 @@ def hai_nan():
def hai_nan_szfbgt(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -2127,6 +2208,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
if fu_jian_list:
for fu_jian in fu_jian_list:
......@@ -2147,7 +2231,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id)
fu_jian['href'] = full_path
print(f'----附件:{fu_jian_href}')
# print(f'----附件:{fu_jian_href}')
else:
pass
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -2176,10 +2260,10 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
log.info(title)
num += 1
count += 1
href_text.close()
except:
pass
req.close()
......@@ -2187,6 +2271,7 @@ def hai_nan():
def hai_nan_zy(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.content, 'html.parser')
......@@ -2240,6 +2325,9 @@ def hai_nan():
pub_hao = ''
contentWithTag = doc_href.find(class_='pages_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{i_href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -2266,10 +2354,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -2277,6 +2367,7 @@ def hai_nan():
def start():
num = 0
count = 0
start_time = time.time()
url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
try:
......@@ -2306,7 +2397,7 @@ def hai_nan():
else:
page_href = str(url) + f'home_{page}.htm'
try:
num += hai_nan_zy(page_href)
count += hai_nan_zy(page_href)
except:
pass
time.sleep(1)
......@@ -2320,7 +2411,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_sw(page_href)
count += hai_nan_sw(page_href)
except:
pass
elif url == leibie_href_list[2]:
......@@ -2332,7 +2423,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_szf(page_href)
count += hai_nan_szf(page_href)
except:
pass
else:
......@@ -2343,22 +2434,22 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_szfbgt(page_href)
count += hai_nan_szfbgt(page_href)
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
start()
hai_nan1()
hai_nan2()
# 四川
def si_chuan():
num = 0
count = 0
pathType = 'policy/sichuan/'
start_time = time.time()
for page in range(1, 3):
......@@ -2393,6 +2484,9 @@ def si_chuan():
doc_href = paserUrl(doc_href, href)
contentWithTag = doc_href.find('div', id='scrollBox')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = doc_href.find_all('a')
for fu_jian in fu_jian_list:
......@@ -2441,19 +2535,20 @@ def si_chuan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西
def guang_xi():
num = 0
count = 0
pathType = 'policy/guangxi/'
start_time = time.time()
url_all = """
......@@ -2519,6 +2614,9 @@ def guang_xi():
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
......@@ -2568,14 +2666,14 @@ def guang_xi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州
def gui_zhou():
......@@ -2585,6 +2683,7 @@ def gui_zhou():
"""
pathType = 'policy/guizhou/'
num = 0
count = 0
start_time = time.time()
for page in range(0, 11):
if page == 0:
......@@ -2630,6 +2729,9 @@ def gui_zhou():
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2678,8 +2780,8 @@ def gui_zhou():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
log.info(title)
count += 1
num = num + 1
except:
pass
......@@ -2697,6 +2799,7 @@ def yun_nan():
http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml 1
"""
num = 0
count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
......@@ -2735,6 +2838,9 @@ def yun_nan():
contentwithTag = \
doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
content = contentwithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2793,18 +2899,20 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
num = num + 1
count += 1
except:
pass
resp.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def yun_nan2():
num = 0
count = 0
start_time = time.time()
for page in range(1, 4):
if page == 1:
......@@ -2828,7 +2936,7 @@ def yun_nan():
num+=1
continue
try:
print(href)
# print(href)
if '.shtml' in href:
res_ = requests.get(href, headers)
page_text_ = res_.text.encode("ISO-8859-1")
......@@ -2847,6 +2955,9 @@ def yun_nan():
pub_hao = ''
contentwithTag = page.find('div', attrs={'class': 'zfxxgk-right'})
content = contentwithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2857,7 +2968,7 @@ def yun_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
print(fu_jian_href)
# print(fu_jian_href)
try:
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -2876,9 +2987,7 @@ def yun_nan():
elif 'display' in href:
continue
else:
content = ''
contentwithTag = ''
pub_hao = ''
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -2907,8 +3016,8 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
log.info(title)
count += 1
num = num + 1
except:
pass
......@@ -2916,7 +3025,7 @@ def yun_nan():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
yun_nan1()
yun_nan2()
......@@ -2928,6 +3037,7 @@ def chong_qing():
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
"""
num = 0
count = 0
pathType = 'policy/chongqing/'
start_time = time.time()
for page in range(0, 4):
......@@ -2955,7 +3065,7 @@ def chong_qing():
num+=1
continue
try:
print(href)
# print(href)
# href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
href_text = requests.get(url=href, headers=headers, verify=False).content
doc_href = pq(href_text)
......@@ -2978,6 +3088,9 @@ def chong_qing():
pass
contentWithTag = doc_href.find('div', class_='zwxl-article')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
except:
origin = ''
topicClassification = ''
......@@ -2986,7 +3099,9 @@ def chong_qing():
pub_hao = ''
contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
# print(fu_jian_list)
for fu_jian in fu_jian_list:
......@@ -3039,21 +3154,22 @@ def chong_qing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
log.info(title)
count += 1
num += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津
def tian_jin():
pathType = 'policy/tianjin/'
def tian_jin1():
num = 0
count = 0
start_time = time.time()
for page in range(0, 3):
if page == 0:
......@@ -3139,7 +3255,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3167,18 +3285,20 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
"""
num = 0
count =0
start_time = time.time()
for page in range(0, 5):
if page == 0:
......@@ -3263,7 +3383,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3291,15 +3413,17 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
......@@ -3391,7 +3515,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3419,12 +3545,13 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
......@@ -3435,6 +3562,7 @@ def xin_jiang():
pathType = 'policy/xinjiang/'
def xin_jiang1():
num = 0
count = 0
start_time = time.time()
for page in range(1, 10):
if page == 1:
......@@ -3493,6 +3621,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3528,15 +3659,17 @@ def xin_jiang():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def xin_jiang_jsbt():
num = 0
count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
......@@ -3592,6 +3725,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3627,6 +3763,7 @@ def xin_jiang():
if flag:
save_data(dic_news)
num += 1
count += 1
href_res.close()
except:
pass
......@@ -3634,7 +3771,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
xin_jiang1()
xin_jiang_jsbt()
......@@ -3643,6 +3780,7 @@ def xin_jiang():
def shan_xi():
pathType = 'policy/shanxi/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 7):
if page == 1:
......@@ -3712,6 +3850,9 @@ def shan_xi():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3747,17 +3888,19 @@ def shan_xi():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 辽宁
def liao_ning():
pathType = 'policy/liaoning/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
......@@ -3823,6 +3966,9 @@ def liao_ning():
if len(contentWithTag) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3858,6 +4004,7 @@ def liao_ning():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
......@@ -3869,6 +4016,7 @@ def liao_ning():
def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
......@@ -3926,6 +4074,9 @@ def hei_long_jiang():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3953,6 +4104,7 @@ def hei_long_jiang():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
......@@ -3960,11 +4112,12 @@ def hei_long_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏
def jiang_su():
num = 0
count = 0
pathType = 'policy/jiangsu/'
start_time = time.time()
pagestart = 1
......@@ -4034,6 +4187,9 @@ def jiang_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
......@@ -4068,18 +4224,20 @@ def jiang_su():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽
def an_hui():
pathType = 'policy/anhui/'
def an_hui1():
num = 0
count = 0
start_time = time.time()
for page in range(1, 4):
url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
......@@ -4137,6 +4295,9 @@ def an_hui():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4164,15 +4325,17 @@ def an_hui():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2():
num = 0
count = 0
start_time = time.time()
for page in range(1, 25):
url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
......@@ -4233,6 +4396,9 @@ def an_hui():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4260,6 +4426,7 @@ def an_hui():
if flag:
save_data(dic_news)
num += 1
count += 1
href_res.close()
except:
pass
......@@ -4267,7 +4434,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1()
an_hui2()
......@@ -4280,6 +4447,7 @@ def jiang_xi():
121-164
"""
num = 0
count = 0
pathType = 'policy/jiangxi/'
start_time = time.time()
startrecord = 1
......@@ -4360,6 +4528,9 @@ def jiang_xi():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
......@@ -4395,16 +4566,18 @@ def jiang_xi():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南
def he_nan():
num = 0
count = 0
pathType = 'policy/henan/'
start_time = time.time()
for page in range(0, 7):
......@@ -4456,6 +4629,9 @@ def he_nan():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -4489,16 +4665,18 @@ def he_nan():
if flag:
save_data(dic_news)
num += 1
count += 1
href_res.close()
resp_text.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南
def hu_nan():
num = 0
count = 0
pathType = 'policy/hunan/'
start_time = time.time()
for page in range(1, 7):
......@@ -4565,6 +4743,9 @@ def hu_nan():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4592,18 +4773,20 @@ def hu_nan():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃
def gan_su():
pathType = 'policy/gansu/'
def gan_su1():
num = 0
count = 0
start_time = time.time()
bro = getDriver()
urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
......@@ -4686,6 +4869,9 @@ def gan_su():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -4715,6 +4901,7 @@ def gan_su():
if flag:
save_data(dic_news)
num += 1
count += 1
except Exception as e:
print(e)
pass
......@@ -4724,6 +4911,7 @@ def gan_su():
def gan_su2():
num = 0
count = 0
start_time = time.time()
bro = getDriver()
url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
......@@ -4821,6 +5009,9 @@ def gan_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2:
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
......@@ -4852,6 +5043,7 @@ def gan_su():
if flag:
save_data(dic_news)
num += 1
count += 1
except Exception as e:
print(e)
except Exception as e:
......@@ -4859,10 +5051,11 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3():
num = 0
count = 0
start_time = time.time()
# # service = Service(r'D:/chrome/103/chromedriver.exe')
# chrome_options = webdriver.ChromeOptions()
......@@ -4979,6 +5172,9 @@ def gan_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2:
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
......@@ -5010,13 +5206,14 @@ def gan_su():
if flag:
save_data(dic_news)
num += 1
count += 1
except Exception as e:
print(e)
except:
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1()
gan_su2()
......@@ -5025,6 +5222,7 @@ def gan_su():
# 宁夏
def ning_xia():
num = 0
count = 0
pathType = 'policy/ningxia/'
start_time = time.time()
for page in range(0, 3):
......@@ -5082,6 +5280,9 @@ def ning_xia():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -5111,16 +5312,18 @@ def ning_xia():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西
def shanxi():
num = 0
count = 0
pathType = 'policy/shan_xi/'
start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
......@@ -5184,6 +5387,9 @@ def shanxi():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5211,6 +5417,7 @@ def shanxi():
if flag:
save_data(dic_news)
num += 1
count += 1
res_href.close()
except:
pass
......@@ -5218,7 +5425,7 @@ def shanxi():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏
def xi_zang():
......@@ -5228,6 +5435,7 @@ def xi_zang():
'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
for url in url_list:
num = 0
count = 0
try:
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
......@@ -5256,6 +5464,9 @@ def xi_zang():
contentWithTag = str(i_soup.find(id='NewsContent'))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5306,18 +5517,20 @@ def xi_zang():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海
def qing_hai():
pathType = 'policy/qinghai/'
def qing_hai1():
num = 0
count = 0
start_time = time.time()
url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
try:
......@@ -5353,6 +5566,9 @@ def qing_hai():
origin = str(page.find('div', attrs={'class': 'foot-fb'}))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5364,7 +5580,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681')
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5405,15 +5621,17 @@ def qing_hai():
# print(id)
# id_list.append(id)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2():
num = 0
count = 0
start_time = time.time()
urls = [
'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
......@@ -5446,6 +5664,7 @@ def qing_hai():
durl = tr.find('a').get('href')
is_href = db_storage.find_one({'网址': durl})
if is_href:
num+=1
log.info('已采集----------跳过')
continue
title = tr.find('a').text
......@@ -5471,6 +5690,9 @@ def qing_hai():
origin = ''
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5482,7 +5704,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681')
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5490,7 +5712,7 @@ def qing_hai():
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器
......@@ -5523,13 +5745,14 @@ def qing_hai():
# print(id)
# id_list.append(id)
num += 1
count += 1
except:
pass
res.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1()
qing_hai2()
......@@ -5537,6 +5760,8 @@ def qing_hai():
# 河北
def he_bei():
num = 0
count = 0
pathType = 'policy/hebei/'
start_time = time.time()
url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
try:
......@@ -5551,6 +5776,7 @@ def he_bei():
href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
continue
pub_time_ = info['updated']
m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级)
......@@ -5569,7 +5795,7 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1668')
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5577,13 +5803,16 @@ def he_bei():
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -5619,14 +5848,17 @@ def he_bei():
if flag:
save_data(dic_news)
num += 1
count += 1
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北
def hu_bei():
num = 0
count = 0
pathType = 'policy/hubei/'
start_time = time.time()
hrefs = []
url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
......@@ -5649,6 +5881,7 @@ def hu_bei():
for href in hrefs:
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
continue
try:
driver.get(href)
......@@ -5684,7 +5917,7 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1675')
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5692,14 +5925,16 @@ def hu_bei():
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5727,48 +5962,49 @@ def hu_bei():
if flag:
save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
driver.close()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
# bei_jing()
# nei_meng_gu()
get_content1()
get_content2()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()
# si_chuan()
# guang_xi()
# gui_zhou()
# yun_nan()
# chong_qing()
# tian_jin()
# xin_jiang()
# shan_xi()
# liao_ning()
# hei_long_jiang()
# jiang_su()
# an_hui()
# jiang_xi()
# he_nan()
# hu_nan()
# gan_su()
# ning_xia()
# xi_zang()
# shanxi()
# qing_hai()
# he_bei()
# qing_hai()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
qing_hai()
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
......@@ -40,7 +40,8 @@ def save_data(dic_news):
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate']
'创建时间':dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
......
# -*- coding: utf-8 -*-
import os
import random
import sys
import time
import logbook
import logbook.more
# 核心工具包
import pymysql
from tqdm import tqdm
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
__cnx_proxy =None
__cursor_proxy = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
def close(self):
try:
self.__cursor_proxy.close()
self.__cnx_proxy.close()
except :
pass
def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
    # Note: this second get_proxy overrides the definition above and returns a list of proxy dicts rather than a single proxy.
    def get_proxy(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
ip_list.append(proxy)
return ip_list
def get_proxyIPPort(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxy = {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
ip_list.append(proxy)
return ip_list
\ No newline at end of file
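The BaseCore helper above is meant to be instantiated once per script and closed before exit, as the comment at the top of the class notes. A minimal usage sketch follows; it only exercises methods defined above, and the target URL and variable names are illustrative, not part of the original code.

# -*- coding: utf-8 -*-
# Usage sketch for BaseCore (illustrative; assumes the clb_project database is reachable).
import time
import requests
from baseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()                         # logbook logger: ./logs/<script>.log + colored stderr
headers = {'User-Agent': baseCore.getRandomUserAgent()}
start_time = time.time()
try:
    resp = requests.get('http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/', headers=headers, timeout=10)
    log.info(f'status={resp.status_code}')
except Exception as e:
    log.error(f'request failed: {e}')
log.info(f'time cost: {baseCore.getTimeCost(start_time, time.time())}')
baseCore.close()                                   # release the MySQL connection before exiting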
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
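The spider and task-job modules below read this config.ini through configparser and pull the [redis], [kafka] and [selenium] sections directly. A short sketch of that pattern, mirroring JrttnewsSpider.__init__ further down (nothing here is hard-coded beyond what the file above already contains):

import configparser
import redis

config = configparser.ConfigParser()
config.read('config.ini')

# Redis connection built from the [redis] section, as the spiders do for URL de-duplication.
r = redis.Redis(host=config.get('redis', 'host'),
                port=config.get('redis', 'port'),
                password=config.get('redis', 'pass'), db=0)
kafka_bootstrap_servers = config.get('kafka', 'bootstrap_servers')   # [kafka] bootstrap_servers
chrome_driver = config.get('selenium', 'chrome_driver')              # [selenium] chromedriver path
print(r.ping(), kafka_bootstrap_servers, chrome_driver)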
# -*- coding: utf-8 -*-
# 智能采集请求
# 1、考虑:请求智能采集时,不再使用实体类
# a. 仍使用:通过HTTP的 raw 请求体,直接传递HTML源文件,通过query参数传递 lang-code、link-text 参数
# b. 原因:在 postman 中,不方便进行测试,无法使用粘贴后的HTML源文件
# 2、不考虑:使用实体类,利大于弊
# a. 使用实体类,方便扩展参数字段
# b. 方便展示接口文档:调用 json_parameter_utility.get_json_parameters 函数,可显示请求实体类
class ExtractionRequest:
# 语言代码
# 1、采集“非中文”的文章时,需要用到语言代码
lang_code = ""
# 链接文本
# 1、用于采集标题,如果不提供,标题的准确度会下降
link_text = ""
# 文章页面源文件
# 1、用于采集标题、发布时间、内容等
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# 尝试方法:
# 1、将字典,更新到内部的 __dict__ 对象
# extraction_request.__dict__.update(dictionary)
# 将字典值,设置到当前对象
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
# 采集结果
class ExtractionResult:
# 标题
title = ""
# 发布日期
publish_date = ""
# 正文(保留所有HTML标记,如:br、img)
text = ""
# URL
url = ""
# 摘要
meta_description = ""
# 干净正文(不带HTML)
cleaned_text = ""
# 来源(目前只支持采集中文网站中的“来源”)
# source = ""
# 顶部图片(top_image:采集不到任何内容,不再使用此属性)
# top_image = ""
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
class UrlPickingRequest:
# 列表页面的响应URL
# 1、作为Base URL,用于拼接提取到的相对URL
# 2、Base URL:必须使用响应URL
# 3、示例:在 Python中,通过 requests.get(url) 请求URL后,需要使用 resp.url 作为 Base URL
list_page_resp_url = ""
# 列表页面源文件
# 1、用于提取文章网址
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# 将字典值,设置到当前对象
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
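As the comments inside these classes point out, JSON serialization goes through to_dict via json.dumps(..., default=...). A short round-trip sketch; all field values here are placeholders:

import json

# Build a request from a plain dict, the way a service would after parsing the HTTP call.
extraction_request = ExtractionRequest.from_dict({
    'lang_code': 'en',
    'link_text': 'Example title',
    'article_html': '<html><body><p>Example body</p></body></html>',
})

# Fill a result and serialize it; json.dumps needs default=ExtractionResult.to_dict,
# exactly as the to_dict comments above describe.
extraction_result = ExtractionResult()
extraction_result.title = extraction_request.link_text
extraction_result.url = 'https://example.com/article'
print(json.dumps(extraction_result, default=ExtractionResult.to_dict, ensure_ascii=False))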
#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote, unquote
from pyquery import PyQuery as pq
class JrttnewsSpider(object):
def __init__(self,searchkw,wordsCode,sid):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.sogou.com/'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
#将列表数据插入到表中 baidu_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
conx,cursorM=self.connMysql()
for item in items:
nowtime=self.getNowDate()
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
except Exception as e:
self.logger.info("数据插入数据库失败!")
finally:
self.closeSql(conx,cursorM)
def connMysql(self):
# 创建MySQL连接
conx = pymysql.connect(host=self.config.get('mysql', 'host'),
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'))
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(self,conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# 解析页面
def parse_page(self):
self.logger.info('解析今日头条列表页')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
        uri = ''  # initialize so the except path below can still return a value
        try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
uri=''
if len(urls)>1:
uri=urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
title=''
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
publishTag=''
detailmsg={
'title':title,
'detailUrl':detailUrl,
'sourceTag':sourceTag,
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
def getNowDate(self):
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
#智能抽取
def paserDetail(self,detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
# element = html2element(detailhtml)
except:
article_content={}
return article_content
#解析时间
def paserTime(self,publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
                delta = datetime.timedelta(days=30 * day)  # timedelta has no 'months' argument; approximate one month as 30 days
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '-' in publishtime:
time_format = '%Y-%m-%d'
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
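    # Illustrative examples of what paserTime returns (assuming "now" is 2023-09-27 10:00):
    #   paserTime('3天前')      -> datetime 2023-09-24 10:00:00
    #   paserTime('2023-09-01') -> datetime 2023-09-01 00:00:00
    #   an unrecognized format  -> the stripped input string is returned unchanged,
    #                              which is why get_page_html checks isinstance(publishdate, str)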
def reqHtml(self,url):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'tt_webid=7283314732298225163; _ga=GA1.1.1730036912.1695778874; _tea_utm_cache_4916=undefined; _S_DPR=1; _S_IPAD=0; s_v_web_id=verify_ln12yyu3_qeLMwQ8s_Offy_4w8b_9kv1_hMDj7V2H2wuE; msToken=7l75aR51vcmcW4LxtvP1cUt2trK37XA-oZdZRTD2Are065KuEBsofVz7vcQ7kFRXkKXY-I0ydJEkpNrx1_XWuurUFWTyIxMuf8Xg5dg-; _ga_QEHZPBE5HH=GS1.1.1695778874.1.1.1695778928.0.0.0; ttwid=1%7C13mqlyEtsSnqRlDNgTCNya74xNS4Azg1-cqxvZ2aJQs%7C1695778929%7C6462d58bd323e4560a0f5db0c443e767a3716878843c0f9a1dec190be930fa37; _S_WIN_WH=1366_353',
'Host':'so.toutiao.com',
'Pragma':'no-cache',
'Referer':'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=%E6%B5%99%E6%B1%9F%E5%9B%BD%E6%9C%89%E8%B5%84%E6%9C%AC%E8%BF%90%E8%90%A5%E5%85%AC%E5%8F%B8&pd=information&action_type=pagination&page_num=1&search_id=202309270941439BB9AFF54062FE7CAC13&from=news&cur_tab_title=news',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
try:
res=requests.get(url,headers=headers,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
except Exception as e:
text=''
return text
def get_realurl(self,tmpurl):
try:
pattern='url=(.{1,}?)&aid'
match = re.search(pattern, tmpurl)
# 判断是否匹配成功
if match:
# 获取匹配的结果
result = match.group(1)
result=unquote(result)
else:
result=''
except:
result=''
return result
def getFormatedate(self,timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
# 获取每一页数据, 开趴.
def get_page_html(self):
#设置采集列表页面和页数
totalnum=3
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
url=f'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword={keyword}&pd=information&action_type=pagination&page_num={pagenum}&from=news&cur_tab_title=news'
lhtml=self.reqHtml(url)
soup = BeautifulSoup(lhtml, 'html.parser')
result_contents=soup.select('div[class="s-result-list"]')
for lists in result_contents:
doc=pq(str(lists))
listcontent=doc.find('div[class="result-content"]')
for litag in listcontent:
try:
lidoc=pq(litag)
ahref=lidoc.find('a[class="text-ellipsis text-underline-hover"]').attr('href')
durl=self.get_realurl(ahref)
title=lidoc.find('a[class="text-ellipsis text-underline-hover"]').text().replace('\n','')
source=lidoc.find('div[class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]>span:nth-child(1)').text().replace('\n','')
publishdate=lidoc.find('div[class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]>span:last-child').text().replace('\n','')
publishdate=self.paserTime(publishdate)
if isinstance(publishdate, str):
pubdate=publishdate
else:
pubdate=publishdate.strftime("%Y-%m-%d %H:%M:%S")
is_member = self.r.sismember('pysouhunews_'+self.wordsCode, durl)
if is_member:
continue
detailmsg={
'title':title,
'detailUrl':durl,
'sourceTag':source,
'publishTag':pubdate
}
self.detailList.put(detailmsg)
except Exception as e:
print(e)
continue
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
except Exception as e:
time.sleep(3)
self.logger.info("详情页解析异常!"+detailUrl)
else:
break
# time.sleep(5)
#解析详情
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
currentdate=self.getNowDate()
kword=self.searchkw
publishDate=detailmsg['publishTag']
publishDate=publishDate+''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg={
'title':detailmsg['title'],
'source':detailmsg['sourceTag'],
'detailurl':detailurl,
'content':content,
'contentHtml':contentWithTag,
'publishtime':publishDate,
'currentdate':currentdate,
'kword':kword
}
return detailmsg
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
html=''
try:
driver.get(url)
# 等待页面加载完成
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
finally:
driver.quit()
return html
def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
return driver
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
lang=self.detect_language(title)
sm=SmartExtractor(lang)
        driver = None
        try:
driver=self.createDriver()
driver.get(url)
# 设置等待时间为10秒
wait = WebDriverWait(driver, 10)
# 等待元素加载完成
element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))
raw_html=driver.page_source
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('div[class="article-content"]>article')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
self.logger.info("定位解析失败!")
if content:
return content,contentWithTag
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
        except Exception as e:
            self.logger.info("抽取解析失败!")
        finally:
            # quit the browser on every call so Chrome processes are not leaked
            if driver:
                driver.quit()
        return content,contentWithTag
def detect_language(self,html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
# 遍历所有标签,并去掉属性
for tag in soup.find_all(True):
            # The original if/elif branches here were identical (keep only 'src') and the trailing else was unreachable,
            # so they are collapsed into a single statement with the same behavior.
            tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html=soup.prettify()
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def getProcessitem(self,bdetail):
nowDate=self.getNowDate()
content=bdetail['content']
if content!='':
processitem={
"sid":self.sid,
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(self,processitem):
        producer = None
        try:
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
        finally:
            if producer:
                producer.close()
def run(self):
# # 获取每页URL
# c = threading.Thread(target=self.get_page_html)
# c.start()
# c.join()
# # 解析详情页
# t = threading.Thread(target=self.get_detail_html)
# t.start()
        self.get_page_html()
if __name__ == '__main__':
    # JrttnewsSpider requires (searchkw, wordsCode, sid); the values below are placeholders for a manual test run
    zhuce = JrttnewsSpider('test', 'testWordsCode', 'testSid')
zhuce.run()
# zhuce.driver.close()
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from jrttnewspider import SouhunewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class SouhunewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
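    # Illustrative expansion performed by getkeywords:
    #   getkeywords('(A|B)+(C|D)') -> ['A+C', 'A+D', 'B+C', 'B+D']
    #   getkeywords('A|B|C')       -> ['A', 'B', 'C']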
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
try:
searchEngines=keymsg['searchEngines']
except Exception as e:
searchEngines=[]
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
pass
# logger.info('+++++')
# keyword=keymsg['keyWord']
# keymsglist=self.getkeywords(keyword)
# for kw in keymsglist:
# kwmsg={
# 'kw':kw,
# 'wordsCode':wordsCode,
# 'sid':id
# }
# kwList.append(kwmsg)
return kwList
# def runSpider(self,kwmsg):
# try:
# searchkw=kwmsg['kw']
# wordsCode=kwmsg['wordsCode']
# sid=kwmsg['sid']
#
# baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
# baiduSpider.get_page_html()
# baiduSpider.get_detail_html()
# except Exception as e:
# logger.info('百度搜索异常'+searchkw)
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
souhunewsSpider=SouhunewsSpider(searchkw,wordsCode,sid)
try:
souhunewsSpider.get_page_html()
except Exception as e:
try:
souhunewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
if souhunewsSpider.detailList.qsize() != 0:
try:
souhunewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
souhunewsTaskJob=SouhunewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
print('---------------')
while True:
try:
try:
keymsg=souhunewsTaskJob.getkafka()
kwList=souhunewsTaskJob.paserKeyMsg(keymsg)
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from jrttnewspider import JrttnewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class JrttnewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
result=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
# try:
# searchEngines=keymsg['searchEngines']
# if 'java.util.ArrayList' in searchEngines:
# searchEngines=searchEngines[1]
# except Exception as e:
# searchEngines=[]
kwList=[]
searchEngines=['3']
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
logger.info('+++++')
else:
logger.info('+++++searchEngines为空')
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
return kwList
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
jrttnewsSpider=JrttnewsSpider(searchkw,wordsCode,sid)
try:
jrttnewsSpider.get_page_html()
except Exception as e:
logger.info('今日头条搜索异常'+searchkw)
if jrttnewsSpider.detailList.qsize() != 0:
try:
jrttnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
logger.info("关键词采集结束!"+searchkw)
import random
if __name__ == '__main__':
jrttnewsTaskJob=JrttnewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
# ss='(中国机床工具工业协会|中国内燃机工业协会|中国机电工业价格协会|中国机械电子兵器船舶工业档案学会|中国仪器仪表行业协会|中国工程机械工业协会|中国文化办公设备制造行业协会|中国机械工业金属切削刀具技术协会|中国机械工业教育协会|中国汽车工业协会|中国机械通用零部件工业协会|中国环保机械行业协会|中国模具工业协会|中国机械工业勘察设计协会|中国机械制造工艺协会|中国机械工业审计学会|中国轴承工业协会|中国机电一体化技术应用协会|中国机械工程学会|中国液压气动密封件工业协会|中国铸造协会|中国通用机械工业协会|中国锻压协会|中国制冷空调工业协会|中国热处理行业协会|中国电工技术学会|中国仪器仪表学会|中国石油和石油化工设备工业协会|中国表面工程协会|中国食品和包装机械工业协会|中国焊接协会|中国汽车工程学会|中国塑料机械工业协会|中国机械工业企业管理协会|中国印刷及设备器材工业协会|中国机械工业质量管理协会|中国电器工业协会|中国机械工业安全卫生协会|中国重型机械工业协会|中国机械工业标准化技术协会|中国机械工业职工思想政治工作研究会|中国农业机械工业协会|中国机电装备维修与改造技术协会 |机械工业信息研究院|机械工业教育发展中心|机械工业经济管理研究院|机械工业信息中心|机械工业人才开发服务中心|机械工业北京电工技术经济研究所|机械工业技术发展基金会|机械工业哈尔滨焊接技术培训中心|机械工业仪器仪表综合技术经济研究所)+(私收会费|私吞|肆意牟利|损失浪费|索贿|贪财|贪官污吏|贪污|违背组织原则|违法|违纪|为官不廉|为政擅权|窝案|舞弊|泄露国家机密|信鬼神|性关系|虚假信息|虚假招标|隐瞒不报|隐瞒真相|营私|鬻爵|主动投案|资产流失|钻空子|钻漏洞|被调查|被双开|不担当|不老实|不良影响|不正当|不作为|超标准建设|超标准装修|吃空饷|吃拿卡要|渎职|对党不忠诚|非法批地|腐败|腐虫|腐化堕落|公车私用|公费开销|公款吃喝|公款出境|公款旅游|勾结|官迷心窍|好色|回扣|贿赂|挤占挪用|纪律审查|监察调查|监守自盗|践踏法律|接受审查调查|截留克扣|开除党籍|开除公职|抗议|利欲熏心|敛财|乱摊派|乱作为|落马|落网|买官|买卖审批权限|卖官|谋取暴利|谋取私利|目无法纪|幕后交易|弄虚作假|挪用公款|骗取|钱色交易|潜规则|侵害权益|侵吞公款|侵占挪用|圈子文化|权利扭曲|权钱交易|权色交易|山头主义|涉案|生活糜烂|生活奢靡|失察|失管|收送|受贿|双规|双开|私分|私人会所|私设小金库|负面|下降|违规|不利|亏损|上诉|不法|不良名单|停职|公开谴责|公诉|内幕交易|刑事拘留|刑事责任|刑拘|判决|判刑|判赔|司法处置|合同纠纷|处分|处罚|强制执行|仲裁|伪造|伪造公章|投案|投诉|拘留|接受调查|控诉|查封|涉嫌|涉诉监察调查|纠纷|经营异常名录|缉捕|罚单|罚款|罚金|罪犯|自首|获刑|行贿|警示函|贪腐|违约金|追究刑责|造假|逮捕|非法|非法集资判决书|申诉|纠纷|通报|开除|留党察看|追债|逃债|资产负债率|情色交易|搞权钱|曝光|黑料|重罚|虚假报告|侵犯)'
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
print('---------------')
while True:
try:
codeList=[]
codeList.append('KW-20220602-0003')
for codeid in codeList:
try:
keymsg=jrttnewsTaskJob.getkeyFromredis(codeid)
kwList=jrttnewsTaskJob.paserKeyMsg(keymsg)
if len(kwList)<1:
continue
logger.info(f"需要搜索的关键词:{kwList}")
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
# 创建一个线程池,线程数量由 max_workers 指定(此处为1)
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(jrttnewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install redis==4.3.5 -i https://pypi.douban.com/simple
pip install kafka-python==2.0.2 -i https://pypi.douban.com/simple
pip install PyMySQL -i https://pypi.douban.com/simple
pip install gne==0.3.0 -i https://pypi.douban.com/simple
pip install selenium==4.9.1 -i https://pypi.douban.com/simple
pip install logbook -i https://pypi.douban.com/simple
pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium-wire==5.1.0
pip install --upgrade selenium
pip install --upgrade urllib3
pip3 uninstall urllib3
ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0i 14 Aug 2018'. See: https://github.com/urllib3/urllib3/issues/2168
\ No newline at end of file
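上面的 ImportError 表示 urllib3 2.x 需要 OpenSSL 1.1.1+。在无法升级系统 OpenSSL 时,常见的规避办法是把 urllib3 固定在 2.0 以下(示意命令,镜像源可按需调整):
pip install "urllib3<2" -i https://pypi.douban.com/simple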
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # 中文
'zh-cn': '简体中文', # 简体中文
'zh': '简体中文', # 简体中文
'ko': '韩语', # 韩语
'ar': '阿拉伯语', # 阿拉伯语
'en': '英语', # 英语
'ru': '俄语', # 俄语
'da': '丹麦语', # 丹麦语
'de': '德语', # 德语
'es': '西班牙语', # 西班牙语
'fi': '芬兰语', # 芬兰语
'fr': '法语', # 法语
'hu': '匈牙利语', # 匈牙利语
'id': '印度尼西亚语', # 印度尼西亚语
'it': '意大利语', # 意大利语
'nb': '挪威语(伯克梅尔)', # 挪威语(伯克梅尔)
'nl': '荷兰语', # 荷兰语
'no': '挪威文(耐诺斯克)', # 挪威文(耐诺斯克)
'pl': '波兰语', # 波兰语
'pt': '葡萄牙语', # 葡萄牙语
'sv': '瑞典语', # 瑞典语
}
return supported_lang_code_dict
def __init__(self, lang_code='cn'):
"""
构造器:未指定 lang_code 参数时,默认为 cn
"""
# 支持语言
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# 初始化 goose 对象:
# 1、根据语言代码,创建 goose 对象
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
# 需要分词:中文
# 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
# 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# 需要分词:韩语
# 1、测试:只传递语言,不传递分词器
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试失败:正文采集为空
# 韩语分词:测试成功
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# 需要分词:阿拉伯语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试成功
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# 单独测试:英文
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# 测试成功:创建Goose对象时,不指定语言默认为英文分词
self.goose = Goose()
elif lang_code == 'ru':
# 单独测试:俄语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
elif lang_code in supported_lang_code_list:
# 其它语言编码,统一处理,不再单独测试
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# 未识别的语言代码
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
获取采集结果:
1、从 article 对象中,采集数据并封装到 ExtractionResult
"""
# 用于保存:采集后的文本
extraction_result = ExtractionResult()
# 标题
# extraction_result.title = article.title # 原办法:使用 goose 采集到的 title 中的标题
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# 发布日期
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# 正文(保留所有HTML标记,如:br、img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# 摘要
extraction_result.meta_description = article.meta_description
# 干净正文(不带HTML)
extraction_result.cleaned_text = article.cleaned_text
# 来源(目前只支持采集中文网站中的“来源”)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_url_test():
# 测试:按URL采集
url_list = [
# "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# 韩文:中央日报-politics
# "https://www.joongang.co.kr/article/25094974",
# "https://www.joongang.co.kr/article/25094967",
# 英文:加德满都邮报-national-security
# "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# 俄语:今日白俄罗斯报-word
# "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# 阿语
# "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# 测试提取标题
# "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# 'https://3g.k.sohu.com/t/n705260979' #天眼查--企业公告'
# 'https://baijiahao.baidu.com/s?id=1769415116218226935'
# 'https://m.gelonghui.com/community/post/1678728#ocr'
'http://epaper.zqrb.cn/html/2023-05/27/content_950333.htm'
]
# 语言编码
lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
for url in url_list:
print()
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# 测试转换为JSON
# 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# print(json.dumps(extraction_result))
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
# 打印测试结果
print_extraction_result(extraction_result)
def extract_by_html_test():
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# 打印测试结果
print_extraction_result(extraction_result)
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
extract_by_url_test()
# 测试:按HTML采集
# extract_by_html_test()
except Exception as e:
print("采集失败:", e)
# -*- coding: utf-8 -*-
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
title = None
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
if title:
# 查找成功,返回元素中的标题
return title
else:
# 查找失败,返回提取到的title属性
# return article.title
# 新考虑:标题采集失败后,返回空值
# 1、原因:article.title 不可靠,只是提取了 title 标签中的内容
return ''
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
title baidu_comm
chcp 65001
cd /d %~dp0
python baidutaskJob.py
\ No newline at end of file
......@@ -169,15 +169,13 @@ class QQnewsTaskJob(object):
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
......
# -*- coding: utf-8 -*-
import os
import random
import sys
import time
import logbook
import logbook.more
# 核心工具包
import pymysql
from tqdm import tqdm
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
__cnx_proxy =None
__cursor_proxy = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
def close(self):
try:
self.__cursor_proxy.close()
self.__cnx_proxy.close()
except :
pass
def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到毫秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
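# 用法示例(说明性注释,非原有代码):
# baseCore = BaseCore()
# logger = baseCore.getLogger()  # 默认同时写入 logs/ 目录下的日志文件并打印到屏幕
# logger.info('采集开始')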
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return random.choice(proxy_list)
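# 注意(说明性注释,非原有代码):下面重新定义的同名 get_proxy 会覆盖上面这个方法,实际生效的是返回代理列表的版本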
def get_proxy(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
ip_list.append(proxy)
return ip_list
def get_proxyIPPort(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxy = {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
ip_list.append(proxy)
return ip_list
\ No newline at end of file
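下面是一个使用 BaseCore 的随机 UA 和代理池发起请求的最小示意(假设 baseCore.py 可直接导入,示例 URL 仅作演示;requests 的 proxies 参数使用小写的 http/https 键,这里做一次转换):
# -*- coding: utf-8 -*-
# 最小示意(非原有代码):随机 User-Agent + 随机代理发起请求
import random
import requests
from baseCore import BaseCore

baseCore = BaseCore()
headers = {'User-Agent': baseCore.getRandomUserAgent()}
# get_proxy() 返回形如 {"HTTP": "http://ip:port", "HTTPS": "http://ip:port"} 的代理字典列表
proxy = random.choice(baseCore.get_proxy())
proxies = {'http': proxy['HTTP'], 'https': proxy['HTTPS']}
resp = requests.get('https://www.baidu.com', headers=headers, proxies=proxies, timeout=10)
print(resp.status_code)
baseCore.close()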
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
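各任务脚本即按上面的 config.ini 读取连接参数,下面是一个最小示意(假设 config.ini 位于当前工作目录,写法与文中 TaskJob 的 __init__ / getkafka 一致):
# -*- coding: utf-8 -*-
# 最小示意(非原有代码):读取 config.ini 并创建 Redis 连接与 Kafka 消费者
import configparser
import json
import redis
from kafka import KafkaConsumer

config = configparser.ConfigParser()
config.read('config.ini')
r = redis.Redis(host=config.get('redis', 'host'),
                port=config.get('redis', 'port'),
                password=config.get('redis', 'pass'), db=0)
consumer = KafkaConsumer(config.get('kafka', 'topic'),
                         group_id=config.get('kafka', 'groupId'),
                         bootstrap_servers=[config.get('kafka', 'bootstrap_servers')],
                         value_deserializer=lambda m: json.loads(m.decode('utf-8')))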
# -*- coding: utf-8 -*-
# 智能采集请求
# 1、考虑:请求智能采集时,不再使用实体类
# a. 仍使用:通过HTTP的 raw 请求体,直接传递HTML源文件,通过query参数传递 lang-code、link-text 参数
# b. 原因:在 postman 中,不方便进行测试,无法使用粘贴后的HTML源文件
# 2、不考虑:使用实体类,利大于弊
# a. 使用实体类,方便扩展参数字段
# b. 方便展示接口文档:调用 json_parameter_utility.get_json_parameters 函数,可显示请求实体类
class ExtractionRequest:
# 语言代码
# 1、采集“非中文”的文章时,需要用到语言代码
lang_code = ""
# 链接文本
# 1、用于采集标题,如果不提供,标题的准确度会下降
link_text = ""
# 文章页面源文件
# 1、用于采集标题、发布时间、内容等
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# 尝试方法:
# 1、将字典,更新到内部的 __dict__ 对象
# extraction_request.__dict__.update(dictionary)
# 将字典值,设置到当前对象
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
# 采集结果
class ExtractionResult:
# 标题
title = ""
# 发布日期
publish_date = ""
# 正文(保留所有HTML标记,如:br、img)
text = ""
# URL
url = ""
# 摘要
meta_description = ""
# 干净正文(不带HTML)
cleaned_text = ""
# 来源(目前只支持采集中文网站中的“来源”)
# source = ""
# 顶部图片(top_image:采集不到任何内容,不再使用此属性)
# top_image = ""
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
class UrlPickingRequest:
# 列表页面的响应URL
# 1、作为Base URL,用于拼接提取到的相对URL
# 2、Base URL:必须使用响应URL
# 3、示例:在 Python中,通过 requests.get(url) 请求URL后,需要使用 resp.url 作为 Base URL
list_page_resp_url = ""
# 列表页面源文件
# 1、用于提取文章网址
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# 将字典值,设置到当前对象
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
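上面注释中提到的两种序列化方式,可用下面的最小示意验证(假设 entity.py 可直接导入):
# -*- coding: utf-8 -*-
# 最小示意(非原有代码):将 ExtractionResult 序列化为 JSON 字符串
import json
from entity import ExtractionResult

result = ExtractionResult()
result.title = '示例标题'
result.publish_date = '2023-05-27'
# 方式一:json.dumps 时指定序列化器
print(json.dumps(result, default=ExtractionResult.to_dict, ensure_ascii=False))
# 方式二:先调用 to_dict() 再序列化
print(json.dumps(result.to_dict(), ensure_ascii=False))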
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install redis==4.3.5 -i https://pypi.douban.com/simple
pip install kafka-python==2.0.2 -i https://pypi.douban.com/simple
pip install PyMySQL -i https://pypi.douban.com/simple
pip install gne==0.3.0 -i https://pypi.douban.com/simple
pip install selenium==4.9.1 -i https://pypi.douban.com/simple
pip install logbook -i https://pypi.douban.com/simple
pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium-wire==5.1.0
pip install --upgrade selenium
pip install --upgrade urllib3
pip3 uninstall urllib3
ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0i 14 Aug 2018'. See: https://github.com/urllib3/urllib3/issues/2168
\ No newline at end of file
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # 中文
'zh-cn': '简体中文', # 简体中文
'zh': '简体中文', # 简体中文
'ko': '韩语', # 韩语
'ar': '阿拉伯语', # 阿拉伯语
'en': '英语', # 英语
'ru': '俄语', # 俄语
'da': '丹麦语', # 丹麦语
'de': '德语', # 德语
'es': '西班牙语', # 西班牙语
'fi': '芬兰语', # 芬兰语
'fr': '法语', # 法语
'hu': '匈牙利语', # 匈牙利语
'id': '印度尼西亚语', # 印度尼西亚语
'it': '意大利语', # 意大利语
'nb': '挪威语(伯克梅尔)', # 挪威语(伯克梅尔)
'nl': '荷兰语', # 荷兰语
'no': '挪威文(耐诺斯克)', # 挪威文(耐诺斯克)
'pl': '波兰语', # 波兰语
'pt': '葡萄牙语', # 葡萄牙语
'sv': '瑞典语', # 瑞典语
}
return supported_lang_code_dict
def __init__(self, lang_code='cn'):
"""
构造器:未指定 lang_code 参数时,默认为 cn
"""
# 支持语言
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# 初始化 goose 对象:
# 1、根据语言代码,创建 goose 对象
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
# 需要分词:中文
# 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
# 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# 需要分词:韩语
# 1、测试:只传递语言,不传递分词器
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试失败:正文采集为空
# 韩语分词:测试成功
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# 需要分词:阿拉伯语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试成功
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# 单独测试:英文
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# 测试成功:创建Goose对象时,不指定语言默认为英文分词
self.goose = Goose()
elif lang_code == 'ru':
# 单独测试:俄语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
elif lang_code in supported_lang_code_list:
# 其它语言编码,统一处理,不再单独测试
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# 未识别的语言代码
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
获取采集结果:
1、从 article 对象中,采集数据并封装到 ExtractionResult
"""
# 用于保存:采集后的文本
extraction_result = ExtractionResult()
# 标题
# extraction_result.title = article.title # 原办法:使用 goose 采集到的 title 中的标题
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# 发布日期
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# 正文(保留所有HTML标记,如:br、img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# 摘要
extraction_result.meta_description = article.meta_description
# 干净正文(不带HTML)
extraction_result.cleaned_text = article.cleaned_text
# 来源(目前只支持采集中文网站中的“来源”)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_url_test():
# 测试:按URL采集
url_list = [
# "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# 韩文:中央日报-politics
# "https://www.joongang.co.kr/article/25094974",
# "https://www.joongang.co.kr/article/25094967",
# 英文:加德满都邮报-national-security
# "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# 俄语:今日白俄罗斯报-word
# "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# 阿语
# "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# 测试提取标题
# "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# 'https://3g.k.sohu.com/t/n705260979' #天眼查--企业公告'
# 'https://baijiahao.baidu.com/s?id=1769415116218226935'
# 'https://m.gelonghui.com/community/post/1678728#ocr'
'http://epaper.zqrb.cn/html/2023-05/27/content_950333.htm'
]
# 语言编码
lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
for url in url_list:
print()
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# 测试转换为JSON
# 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# print(json.dumps(extraction_result))
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
# 打印测试结果
print_extraction_result(extraction_result)
def extract_by_html_test():
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# 打印测试结果
print_extraction_result(extraction_result)
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
extract_by_url_test()
# 测试:按HTML采集
# extract_by_html_test()
except Exception as e:
print("采集失败:", e)
# -*- coding: utf-8 -*-
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
# 遍历所有候选元素后仍未匹配到标题:返回空字符串
# 1、原因:article.title 不可靠,只是提取了 title 标签中的内容,不再作为兜底标题返回
return ''
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
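# 简单的本地自测入口(示意草稿:样例HTML与文本均为虚构数据,仅演示返回值):
if __name__ == '__main__':
    sample_html = '<div class="info"><span>来源:示例网站</span><span>2023-05-27 10:30:00</span></div>'
    # 预期输出:2023-05-27 10:30:00
    print('发布日期:', SmartExtractorUtility.extract_publish_date(sample_html))
    # 预期输出:第一段<br>第二段
    print('换行包装:', SmartExtractorUtility.add_html_br('第一段\n\n第二段'))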
#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from urllib.parse import quote, unquote
class SouhunewsSpider(object):
def __init__(self,searchkw,wordsCode,sid):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.sogou.com/'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
#将列表数据插入到表中 baidu_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
conx,cursorM=self.connMysql()
for item in items:
nowtime=self.getNowDate()
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
except Exception as e:
self.logger.info("数据插入数据库失败!")
finally:
self.closeSql(conx,cursorM)
def connMysql(self):
# 创建MySQL连接
conx = pymysql.connect(host=self.config.get('mysql', 'host'),
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'))
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(self,conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# 解析页面
def parse_page(self):
self.logger.info('解析搜狗列表页')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
uri=''
try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
uri=''
if len(urls)>1:
uri=urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
title=''
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
publishTag=''
detailmsg={
'title':title,
'detailUrl':detailUrl,
'sourceTag':sourceTag,
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
def getNowDate(self):
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
#智能抽取
def paserDetail(self,detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
# element = html2element(detailhtml)
except:
article_content={}
return article_content
#解析时间
def paserTime(self,publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
# datetime.timedelta不支持months参数,这里按每月30天近似
delta = datetime.timedelta(days=30 * day)
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '-' in publishtime:
time_format = '%Y-%m-%d'
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
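# 换算示例(示意说明,以当前时间为基准):
# 1、'3天前'    -> 当前时间减去3天
# 2、'2小时前'  -> 统一减去5小时(上方分支的粗略处理)
# 3、'5月27日'  -> 补全当前年份后按'%Y年%m月%d日'解析
# 4、'2023-05-27' -> 按'%Y-%m-%d'解析
# 5、无法识别的格式在异常分支中返回strip后的原字符串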
def reqSouhuHtml(self,url):
headers={
'Connection':'keep-alive',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'Accept':'application/json, text/javascript, */*; q=0.01',
'X-Requested-With':'XMLHttpRequest',
'sec-ch-ua-mobile':'?0',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua-platform':'"Windows"',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://search.sohu.com/?keyword=%E6%B5%99%E6%B1%9F%E5%9B%BD%E6%9C%89%E8%B5%84%E6%9C%AC%E8%BF%90%E8%90%A5%E5%85%AC%E5%8F%B8&type=10002&ie=utf8&queryType=default&spm=smpc.channel_258.search-box.1695794576553aEKIAK5_1090',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'SUV=1695794576771hvzq2n; clt=1695794576; cld=20230927140256; t=1695794594569; reqtype=pc; gidinf=x099980109ee17b02bbffd42800081cb2f516277b38e',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
try:
res=requests.get(url,headers=headers,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
except Exception as e:
text=''
return text
def get_realurl(self,tmpurl):
try:
pattern='url=(.{1,}?)&aid'
match = re.search(pattern, tmpurl)
# 判断是否匹配成功
if match:
# 获取匹配的结果
result = match.group(1)
result=unquote(result)
else:
result=''
except:
result=''
return result
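# 用法示例(示意说明,URL为虚构样例):
# 输入:'https://www.sohu.com/link?url=https%3A%2F%2Fwww.sohu.com%2Fa%2F123456&aid=xxx'
# 正则截取url=与&aid之间的内容并unquote,返回:'https://www.sohu.com/a/123456'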
def getFormatedate(self,timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
# 获取每一页数据, 开趴.
def get_page_html(self):
#设置采集列表页面和页数
totalnum=3
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
pn=pagenum*10
url=f'https://search.sohu.com/search/meta?keyword={keyword}&terminalType=pc&spm-pre=smpc.channel_258.search-box.1695794576553aEKIAK5_1090&SUV=1695794576771hvzq2n&from={pn}&size=10&searchType=news&queryType=outside&queryId=16957949390005rWZ006&pvId=1695794594446PVP9iHO&refer=http%3A//news.sohu.com/&size=10&maxL=15&spm=&_=1695794594452'
lhtml=self.reqSouhuHtml(url)
lljson=json.loads(lhtml)
dmsgList=lljson['data']['news']
for dd in dmsgList:
try:
source=dd['authorName']
title=dd['title']
durl=dd['url']
ptime=dd['postTime']
try:
pubdate=self.getFormatedate(int(ptime / 1000))
except Exception as e:
pubdate=''
is_member = self.r.sismember('pysouhunews_'+self.wordsCode, durl)
if is_member:
continue
detailmsg={
'title':title,
'detailUrl':durl,
'sourceTag':source,
'publishTag':pubdate
}
self.detailList.put(detailmsg)
except Exception as e :
print(e)
continue
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
except Exception as e:
time.sleep(3)
self.logger.info("详情页解析异常!"+detailUrl)
else:
break
# time.sleep(5)
#解析详情
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
currentdate=self.getNowDate()
kword=self.searchkw
publishDate=detailmsg['publishTag']
publishDate=publishDate+''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg={
'title':detailmsg['title'],
'source':detailmsg['sourceTag'],
'detailurl':detailurl,
'content':content,
'contentHtml':contentWithTag,
'publishtime':publishDate,
'currentdate':currentdate,
'kword':kword
}
return detailmsg
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
html=''
try:
driver.get(url)
# 等待页面加载完成
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
finally:
driver.quit()
return html
def reqHtml(self,url):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'_qpsvr_localtk=0.13653936306726644; RK=GPtQWVDskM; ptcz=5f36ee88c33e1060914663a0a68c2fc547d594312b58222b351428c8ba8bba1f; uin=o2468741258; pac_uid=1_2468741258; iip=0; ad_play_index=20; ss=1',
'Host':'new.qq.com',
'Pragma':'no-cache',
'Referer':'https://new.qq.com/search?query=%E6%B5%99%E6%B1%9F%E5%9B%BD%E6%9C%89%E8%B5%84%E6%9C%AC%E8%BF%90%E8%90%A5%E5%85%AC%E5%8F%B8&page=1',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
res=requests.get(url,headers=headers,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
return text
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
raw_html=self.reqSouhuHtml(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('article[class="article"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
self.logger.info("定位解析失败!")
if content:
return content,contentWithTag
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
if content:
return content,contentWithTag
try:
raw_html=self.webDriver(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('article[class="article"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
self.logger.info("定位解析失败!")
if content:
return content,contentWithTag
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
except Exception as e:
try:
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
return content,contentWithTag
def detect_language(self,html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
# 遍历所有标签,并去掉属性
for tag in soup.find_all(True):
    if tag.name == 'img':
        # 图片标签只保留src属性
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    elif tag.name == 'a':
        # 链接标签只保留href属性(上面的paserUrl已将其转换为绝对地址)
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'href'}
    else:
        # 其它标签去掉全部属性
        tag.attrs = {}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html=soup.prettify()
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
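# 转换示例(示意说明,地址为虚构样例):
# listurl='https://www.sohu.com/a/123456',html中的<img src="/images/1.png">
# 经urljoin处理后变为<img src="https://www.sohu.com/images/1.png">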
def getProcessitem(self,bdetail):
    nowDate=self.getNowDate()
    content=bdetail['content']
    # content为空时返回空字典,避免processitem未定义
    processitem={}
    if content!='':
processitem={
"sid":self.sid,
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(self,processitem):
try:
producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
finally:
producer.close()
def run(self):
# # 获取每页URL
# c = threading.Thread(target=self.get_page_html)
# c.start()
# c.join()
# # 解析详情页
# t = threading.Thread(target=self.get_detail_html)
# t.start()
self.get_page_html()
self.get_detail_html()
if __name__ == '__main__':
    # 本地测试入口:SouhunewsSpider需要searchkw、wordsCode、sid三个参数,此处仅为示例取值
    zhuce = SouhunewsSpider('浙江国有资本运营公司', 'KW-20220602-0003', '1')
    zhuce.run()
    # zhuce.driver.close()
#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
class QQnewsSpider(object):
def __init__(self,searchkw,wordsCode,sid):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.sogou.com/'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
#将列表数据插入到表中 baidu_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
conx,cursorM=self.connMysql()
for item in items:
nowtime=self.getNowDate()
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
except Exception as e:
self.logger.info("数据插入数据库失败!")
finally:
self.closeSql(conx,cursorM)
def connMysql(self):
# 创建MySQL连接
conx = pymysql.connect(host=self.config.get('mysql', 'host'),
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'))
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(self,conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# 解析页面
def parse_page(self):
self.logger.info('解析搜狗列表页')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
uri=''
try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
uri=''
if len(urls)>1:
uri=urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
title=''
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
publishTag=''
detailmsg={
'title':title,
'detailUrl':detailUrl,
'sourceTag':sourceTag,
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
def getNowDate(self):
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
#智能抽取
def paserDetail(self,detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
# element = html2element(detailhtml)
except:
article_content={}
return article_content
#解析时间
def paserTime(self,publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
# datetime.timedelta不支持months参数,这里按每月30天近似
delta = datetime.timedelta(days=30 * day)
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '-' in publishtime:
time_format = '%Y-%m-%d'
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
# 获取每一页数据, 开趴.
def get_page_html(self):
self.logger.info("进入搜狗首页...")
self.driver.get(self.url)
self.driver.find_element(By.ID, 'query').send_keys(self.searchkw)
self.driver.find_element(By.ID, 'stb').click()
wait = WebDriverWait(self.driver, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)
self.driver.find_element(By.ID, 'sogou_news').click()
wait = WebDriverWait(self.driver, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)
self.logger.info("开始抓取首页...")
try:
flag, lists = self.parse_page()
if len(lists)<1:
return
except Exception as e:
time.sleep(5)
return
if len(lists)==0:
time.sleep(5)
for detail in lists:
durl=detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
continue
self.detailList.put(detail)
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//a[@id="sogou_next"]//text()')[0]
hasnext = hasnext.strip()
timeFlag=False
while hasnext == '下一页':
try:
if self.page_num==2:
break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
try:
self.driver.find_element(By.XPATH, '//a[@id="sogou_next"]').click()
except Exception as e:
time.sleep(5)
continue
time.sleep(5)
flag, lists = self.parse_page()
if len(lists)<1:
break
for detail in lists:
publishTag=detail['publishTag']
# if publishTag:
# pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
# needDate='2022-01-01 00:00:00'
# needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
# if pubtime < needTime:
# timeFlag = True
# break
durl=detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
continue
self.detailList.put(detail)
if timeFlag:
break
try:
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//a[@id="sogou_next"]//text()')[0]
hasnext = hasnext.strip()
except Exception as e:
hasnext=''
except Exception as e:
time.sleep(5)
break
self.logger.info("抓取完毕")
#获取资讯内容信息
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
currentdate=self.getNowDate()
kword=self.searchkw
publishDate=detailmsg['publishTag']
publishDate=publishDate+''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg={
'title':detailmsg['title'],
'source':detailmsg['sourceTag'],
'detailurl':detailurl,
'content':content,
'contentHtml':contentWithTag,
'publishtime':publishDate,
'currentdate':currentdate,
'kword':kword
}
return detailmsg
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
html=''
try:
driver.get(url)
# 等待页面加载完成
# wait = WebDriverWait(self.driver, 20)
# wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
finally:
driver.quit()
return html
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
try:
lang=self.detect_language(title)
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
try:
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
return content,contentWithTag
def detect_language(self,html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
# # js = "window.open('"+detailUrl+"')"
# # self.driver.execute_script(js)
# try:
# self.driver.get(detailUrl)
# except Exception as e:
# self.driver.quit()
# self.driver=self.createDriver()
# self.driver.get(detailUrl)
#
# response = self.driver.page_source
# bdetail=self.getDetailmsg(response,detailmsg)
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pysougou_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
except Exception as e:
time.sleep(3)
self.logger.info("详情页解析异常!"+detailUrl)
else:
break
# time.sleep(5)
# def getDetailmsg(self,detailhtml,detailmsg):
# try:
# detailurl=detailmsg['detailUrl']
# article_content=self.paserDetail(detailhtml,detailurl)
# content=article_content['content']
# contentWithTag=article_content['body_html']
# except Exception as e:
# self.logger.info('内容抽取失败')
# content=''
# contentWithTag=''
# currentdate=self.getNowDate()
# kword=self.searchkw
# publishtime=detailmsg['publishTag']
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
# detailmsg={
# 'title':detailmsg['title'],
# 'source':detailmsg['sourceTag'],
# 'detailurl':detailurl,
# 'content':content,
# 'contentHtml':contentWithTag,
# 'publishtime':publishDate,
# 'currentdate':currentdate,
# 'kword':kword
# }
# return detailmsg
def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
# 遍历所有标签,并去掉属性
for tag in soup.find_all(True):
    if tag.name == 'img':
        # 图片标签只保留src属性
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    elif tag.name == 'a':
        # 链接标签只保留href属性(上面的paserUrl已将其转换为绝对地址)
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'href'}
    else:
        # 其它标签去掉全部属性
        tag.attrs = {}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html=soup.prettify()
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def getProcessitem(self,bdetail):
    nowDate=self.getNowDate()
    content=bdetail['content']
    # content为空时返回空字典,避免processitem未定义
    processitem={}
    if content!='':
processitem={
"sid":self.sid,
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(self,processitem):
try:
producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
finally:
producer.close()
def run(self):
# 获取每页URL
c = threading.Thread(target=self.get_page_html)
c.start()
c.join()
# 解析详情页
t = threading.Thread(target=self.get_detail_html)
t.start()
if __name__ == '__main__':
    # 本地测试入口:QQnewsSpider需要searchkw、wordsCode、sid三个参数,此处仅为示例取值
    zhuce = QQnewsSpider('浙江国有资本运营公司', 'KW-20220602-0003', '1')
    # zhuce.run()
    # zhuce.driver.close()
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from souhunewspider import SouhunewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class SouhunewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
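# 关键词展开示例(示意说明,关键词为虚构样例):
# 1、'(A|B)+(C|D)' -> 笛卡尔积展开为 ['A+C', 'A+D', 'B+C', 'B+D']
# 2、'A+(C|D)'     -> ['A+C', 'A+D']
# 3、'A|B|C'       -> ['A', 'B', 'C']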
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
try:
searchEngines=keymsg['searchEngines']
except Exception as e:
searchEngines=[]
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
pass
# logger.info('+++++')
# keyword=keymsg['keyWord']
# keymsglist=self.getkeywords(keyword)
# for kw in keymsglist:
# kwmsg={
# 'kw':kw,
# 'wordsCode':wordsCode,
# 'sid':id
# }
# kwList.append(kwmsg)
return kwList
# def runSpider(self,kwmsg):
# try:
# searchkw=kwmsg['kw']
# wordsCode=kwmsg['wordsCode']
# sid=kwmsg['sid']
#
# baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
# baiduSpider.get_page_html()
# baiduSpider.get_detail_html()
# except Exception as e:
# logger.info('百度搜索异常'+searchkw)
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
souhunewsSpider=SouhunewsSpider(searchkw,wordsCode,sid)
try:
souhunewsSpider.get_page_html()
except Exception as e:
try:
souhunewsSpider.get_page_html()
except Exception as e:
logger.info('搜狐搜索异常'+searchkw)
if souhunewsSpider.detailList.qsize() != 0:
try:
souhunewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
souhunewsTaskJob=SouhunewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
print('---------------')
while True:
try:
try:
keymsg=souhunewsTaskJob.getkafka()
kwList=souhunewsTaskJob.paserKeyMsg(keymsg)
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from souhunewspider import SouhunewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class SouhunewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
# try:
# searchEngines=keymsg['searchEngines']
# if 'java.util.ArrayList' in searchEngines:
# searchEngines=searchEngines[1]
# except Exception as e:
# searchEngines=[]
kwList=[]
searchEngines=['3']
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
logger.info('+++++')
else:
logger.info('+++++searchEngines为空')
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
return kwList
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
qqnewsSpider=SouhunewsSpider(searchkw,wordsCode,sid)
try:
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狐搜索异常'+searchkw)
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
logger.info("关键词采集结束!"+searchkw)
import random
if __name__ == '__main__':
souhunewsTaskJob=SouhunewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
# ss='(中国机床工具工业协会|中国内燃机工业协会|中国机电工业价格协会|中国机械电子兵器船舶工业档案学会|中国仪器仪表行业协会|中国工程机械工业协会|中国文化办公设备制造行业协会|中国机械工业金属切削刀具技术协会|中国机械工业教育协会|中国汽车工业协会|中国机械通用零部件工业协会|中国环保机械行业协会|中国模具工业协会|中国机械工业勘察设计协会|中国机械制造工艺协会|中国机械工业审计学会|中国轴承工业协会|中国机电一体化技术应用协会|中国机械工程学会|中国液压气动密封件工业协会|中国铸造协会|中国通用机械工业协会|中国锻压协会|中国制冷空调工业协会|中国热处理行业协会|中国电工技术学会|中国仪器仪表学会|中国石油和石油化工设备工业协会|中国表面工程协会|中国食品和包装机械工业协会|中国焊接协会|中国汽车工程学会|中国塑料机械工业协会|中国机械工业企业管理协会|中国印刷及设备器材工业协会|中国机械工业质量管理协会|中国电器工业协会|中国机械工业安全卫生协会|中国重型机械工业协会|中国机械工业标准化技术协会|中国机械工业职工思想政治工作研究会|中国农业机械工业协会|中国机电装备维修与改造技术协会 |机械工业信息研究院|机械工业教育发展中心|机械工业经济管理研究院|机械工业信息中心|机械工业人才开发服务中心|机械工业北京电工技术经济研究所|机械工业技术发展基金会|机械工业哈尔滨焊接技术培训中心|机械工业仪器仪表综合技术经济研究所)+(私收会费|私吞|肆意牟利|损失浪费|索贿|贪财|贪官污吏|贪污|违背组织原则|违法|违纪|为官不廉|为政擅权|窝案|舞弊|泄露国家机密|信鬼神|性关系|虚假信息|虚假招标|隐瞒不报|隐瞒真相|营私|鬻爵|主动投案|资产流失|钻空子|钻漏洞|被调查|被双开|不担当|不老实|不良影响|不正当|不作为|超标准建设|超标准装修|吃空饷|吃拿卡要|渎职|对党不忠诚|非法批地|腐败|腐虫|腐化堕落|公车私用|公费开销|公款吃喝|公款出境|公款旅游|勾结|官迷心窍|好色|回扣|贿赂|挤占挪用|纪律审查|监察调查|监守自盗|践踏法律|接受审查调查|截留克扣|开除党籍|开除公职|抗议|利欲熏心|敛财|乱摊派|乱作为|落马|落网|买官|买卖审批权限|卖官|谋取暴利|谋取私利|目无法纪|幕后交易|弄虚作假|挪用公款|骗取|钱色交易|潜规则|侵害权益|侵吞公款|侵占挪用|圈子文化|权利扭曲|权钱交易|权色交易|山头主义|涉案|生活糜烂|生活奢靡|失察|失管|收送|受贿|双规|双开|私分|私人会所|私设小金库|负面|下降|违规|不利|亏损|上诉|不法|不良名单|停职|公开谴责|公诉|内幕交易|刑事拘留|刑事责任|刑拘|判决|判刑|判赔|司法处置|合同纠纷|处分|处罚|强制执行|仲裁|伪造|伪造公章|投案|投诉|拘留|接受调查|控诉|查封|涉嫌|涉诉监察调查|纠纷|经营异常名录|缉捕|罚单|罚款|罚金|罪犯|自首|获刑|行贿|警示函|贪腐|违约金|追究刑责|造假|逮捕|非法|非法集资判决书|申诉|纠纷|通报|开除|留党察看|追债|逃债|资产负债率|情色交易|搞权钱|曝光|黑料|重罚|虚假报告|侵犯)'
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
print('---------------')
while True:
try:
codeList=[]
codeList.append('KW-20220602-0003')
for codeid in codeList:
try:
keymsg=souhunewsTaskJob.getkeyFromredis(codeid)
kwList=souhunewsTaskJob.paserKeyMsg(keymsg)
if len(kwList)<1:
continue
logger.info(f"需要搜索的关键词:{kwList}")
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
title baidu_comm
chcp 65001
cd /d %~dp0
python baidutaskJob.py
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
# -*- coding: utf-8 -*-
# 智能采集请求
# 1、考虑:请求智能采集时,不再使用实体类
# a. 仍使用:通过HTTP的 raw 请求体,直接传递HTML源文件,通过query参数传递 lang-code、link-text 参数
# b. 原因:在 postman 中,不方便进行测试,无法使用粘贴后的HTML源文件
# 2、不考虑:使用实体类,利大于弊
# a. 使用实体类,方便扩展参数字段
# b. 方便展示接口文档:调用 json_parameter_utility.get_json_parameters 函数,可显示请求实体类
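# 3、调用示例(示意草稿:接口地址与端口为假设值,实际以部署的Flask服务为准):
#    import requests
#    resp = requests.post(
#        'http://127.0.0.1:5000/extract?lang-code=cn&link-text=示例标题',  # 假设的接口地址
#        data=article_html.encode('utf-8'),      # raw请求体:直接传递文章页面HTML源文件
#        headers={'Content-Type': 'text/html; charset=utf-8'})
#    print(resp.json())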
class ExtractionRequest:
# 语言代码
# 1、采集“非中文”的文章时,需要用到语言代码
lang_code = ""
# 链接文本
# 1、用于采集标题,如果不提供,标题的准确度会下降
link_text = ""
# 文章页面源文件
# 1、用于采集标题、发布时间、内容等
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# 尝试方法:
# 1、将字典,更新到内部的 __dict__ 对象
# extraction_request.__dict__.update(dictionary)
# 将字典值,设置到当前对象
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
# 采集结果
class ExtractionResult:
# 标题
title = ""
# 发布日期
publish_date = ""
# 正文(保留所有HTML标记,如:br、img)
text = ""
# URL
url = ""
# 摘要
meta_description = ""
# 干净正文(不带HTML)
cleaned_text = ""
# 来源(目前只支持采集中文网站中的“来源”)
# source = ""
# 顶部图片(top_image:采集不到任何内容,不再使用此属性)
# top_image = ""
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
class UrlPickingRequest:
# 列表页面的响应URL
# 1、作为Base URL,用于拼接提取到的相对URL
# 2、Base URL:必须使用响应URL
# 3、示例:在 Python中,通过 requests.get(url) 请求URL后,需要使用 resp.url 作为 Base URL
list_page_resp_url = ""
# 列表页面源文件
# 1、用于提取文章网址
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# 将字典值,设置到当前对象
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
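# 简单的序列化自测入口(示意草稿:字段取值为虚构样例):
if __name__ == '__main__':
    import json
    # 从字典构造请求对象,再转回字典
    request = ExtractionRequest.from_dict({'lang_code': 'cn', 'link_text': '示例标题', 'article_html': '<html></html>'})
    print(request.to_dict())
    # 采集结果对象:序列化为JSON字符串时,指定 default=ExtractionResult.to_dict
    result = ExtractionResult()
    result.title = '示例标题'
    print(json.dumps(result, default=ExtractionResult.to_dict, ensure_ascii=False))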
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install redis==4.3.5 -i https://pypi.douban.com/simple
pip install kafka-python==2.0.2 -i https://pypi.douban.com/simple
pip install PyMySQL -i https://pypi.douban.com/simple
pip install gne==0.3.0 -i https://pypi.douban.com/simple
pip install selenium==4.9.1 -i https://pypi.douban.com/simple
pip install logbook -i https://pypi.douban.com/simple
pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium-wire==5.1.0
pip install --upgrade selenium
pip install --upgrade urllib3
pip3 uninstall urllib3
ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0i 14 Aug 2018'. See: https://github.com/urllib3/urllib3/issues/2168
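若无法升级系统OpenSSL,可考虑将urllib3固定在1.x版本(仅供参考):
pip install "urllib3<2" -i https://mirrors.aliyun.com/pypi/simple/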
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # 中文
'zh-cn': '简体中文', # 简体中文
'zh': '简体中文', # 简体中文
'ko': '韩语', # 韩语
'ar': '阿拉伯语', # 阿拉伯语
'en': '英语', # 英语
'ru': '俄语', # 俄语
'da': '丹麦语', # 丹麦语
'de': '德语', # 德语
'es': '西班牙语', # 西班牙语
'fi': '芬兰语', # 芬兰语
'fr': '法语', # 法语
'hu': '匈牙利语', # 匈牙利语
'id': '印度尼西亚语', # 印度尼西亚语
'it': '意大利语', # 意大利语
'nb': '挪威语(伯克梅尔)', # 挪威语(伯克梅尔)
'nl': '荷兰语', # 荷兰语
'no': '挪威文(耐诺斯克)', # 挪威文(耐诺斯克)
'pl': '波兰语', # 波兰语
'pt': '葡萄牙语', # 葡萄牙语
'sv': '瑞典语', # 瑞典语
}
return supported_lang_code_dict
def __init__(self, lang_code='cn'):
"""
构造器:未指定 lang_code 参数时,默认为 cn
"""
# 支持语言
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# 初始化 goose 对象:
# 1、根据语言代码,创建 goose 对象
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
# 需要分词:中文
# 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
# 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# 需要分词:韩语
# 1、测试:只传递语言,不传递分词器
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试失败:正文采集为空
# 韩语分词:测试成功
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# 需要分词:阿拉伯语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试成功
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# 单独测试:英文
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# 测试成功:创建Goose对象时,不指定语言默认为英文分词
self.goose = Goose()
elif lang_code == 'ru':
# 单独测试:俄语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
elif lang_code in supported_lang_code_list:
# 其它语言编码,统一处理,不再单独测试
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# 未识别的语言代码
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
获取采集结果:
1、从 artcile 对象中,采集数据并封装到 ExtractionResult
"""
# 用于保存:采集后的文本
extraction_result = ExtractionResult()
# 标题
# extraction_result.title = article.title # 原办法:使用 goose 采集到的 title 中的标题
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# 发布日期
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# 正文(保留所有HTML标记,如:br、img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# 摘要
extraction_result.meta_description = article.meta_description
# 干净正文(不带HTML)
extraction_result.cleaned_text = article.cleaned_text
# 来源(目前只支持采集中文网站中的“来源”)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
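# 用法示例(示意,URL为假设值):
#   extractor = SmartExtractor('cn')
#   result = extractor.extract_by_url('http://example.com/news/1.html')
#   print(result.title, result.publish_date, result.text)
# 若已自行取得页面源码,可改用 extract_by_html(html, link_text),返回同样的 ExtractionResult。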
def extract_by_url_test():
# 测试:按URL采集
url_list = [
# "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# 韩文:中央日报-politics
# "https://www.joongang.co.kr/article/25094974",
# "https://www.joongang.co.kr/article/25094967",
# 英文:加德满都邮报-national-security
# "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# 俄语:今日白俄罗斯报-word
# "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# 阿语
# "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# 测试提取标题
# "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# 'https://3g.k.sohu.com/t/n705260979' #天眼查--企业公告'
# 'https://baijiahao.baidu.com/s?id=1769415116218226935'
# 'https://m.gelonghui.com/community/post/1678728#ocr'
'http://epaper.zqrb.cn/html/2023-05/27/content_950333.htm'
]
# 语言编码
lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
for url in url_list:
print()
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# 测试转换为JSON
# 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# print(json.dumps(extraction_result))
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
# 打印测试结果
print_extraction_result(extraction_result)
def extract_by_html_test():
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# 打印测试结果
print_extraction_result(extraction_result)
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
extract_by_url_test()
# 测试:按HTML采集
# extract_by_html_test()
except Exception as e:
print("采集失败:", e)
# -*- coding: utf-8 -*-
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
title = None
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
if title:
# 查找成功,返回元素中的标题
return title
else:
# 查找失败,返回提取到的title属性
# return article.title
# 新考虑:标题采集失败后,返回空值
# 1、原因:article.title 不可靠,只是提取了 title 标签中的内容
return ''
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
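    # 定位策略示例(示意,元素id为假设值):
    #   若 top_node 为 <div id="ArticleContent">,则构造查询 "//*[@id='ArticleContent']",
    #   再到 article.raw_doc 的原始文档树上执行该 XPath,取回保留 img 等标签的完整正文HTML;
    #   没有 id/class 时才退回 getroottree().getpath() 生成的绝对路径。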
title baidu_comm
chcp 65001
cd /d %~dp0
python baidutaskJob.py
\ No newline at end of file
#coding=utf-8
from urllib import parse
from urllib.parse import urljoin
from pyquery import PyQuery as pq
import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from retry import retry
from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from urllib.parse import quote, unquote
class QQnewsSpider(object):
def __init__(self,searchkw,wordsCode,sid):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.sogou.com/'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
#将列表数据插入到表中 baidu_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
conx,cursorM=self.connMysql()
for item in items:
nowtime=self.getNowDate()
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
except Exception as e:
self.logger.info("数据插入数据库失败!")
finally:
self.closeSql(conx,cursorM)
def connMysql(self):
# 创建MySQL连接
conx = pymysql.connect(host=self.config.get('mysql', 'host'),
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'))
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(self,conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# 解析页面
def parse_page(self):
self.logger.info('解析搜狗列表页')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
        uri = ''
        try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
            if urls:
                uri = urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
title=''
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
publishTag=''
detailmsg={
'title':title,
'detailUrl':detailUrl,
'sourceTag':sourceTag,
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
def getNowDate(self):
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
#智能抽取
def paserDetail(self,detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
# element = html2element(detailhtml)
except:
article_content={}
return article_content
#解析时间
def paserTime(self,publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
                # datetime.timedelta 不支持 months 参数,按每月30天近似
                delta = datetime.timedelta(days=30 * day)
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '-' in publishtime:
time_format = '%Y-%m-%d'
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
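    # 用法示例(示意):
    #   self.paserTime('3天前')      -> 当前时间减去3天的 datetime
    #   self.paserTime('2023-05-27') -> datetime(2023, 5, 27)
    #   无法识别的格式会原样返回传入的字符串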
    @retry(tries=3, delay=1)
def getrequest(self,session,real_url,keyword):
res=session.get(real_url,timeout=10)
res.encoding = res.apparent_encoding
text=res.text
if keyword not in text:
raise ValueError("Invalid value")
return text
def reqPostMsg(self,url,session,keyword):
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}
# session = requests.session()
URL = 'https://www.toutiao.com/'
session.get(URL, headers=headers)
        real_url = url + f'wid_ct={time.time()*1000}' + '&dvpf=pc&source=input&keyword=' + keyword + '&pd=information&action_type=search_subtab_switch&page_num=0&from=news&cur_tab_title=news'
try:
            text = self.getrequest(session, real_url, keyword)
except Exception as e:
text=''
return text
def get_realurl(self,tmpurl):
try:
pattern = 'url=(.{1,}?)&aid'
match = re.search(pattern, tmpurl)
# 判断是否匹配成功
if match:
# 获取匹配的结果
result = match.group(1)
result = unquote(result)
else:
result = ''
except:
result = ''
return result
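    # 用法示例(示意,跳转链接为假设值):
    #   self.get_realurl('https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.example.com%2Fnews%2F1&aid=4916')
    #   返回解码后的真实地址 'https://www.example.com/news/1';匹配失败时返回空字符串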
    # 获取每一页数据, 开爬.
def get_page_html(self):
#设置采集列表页面和页数
# url='https://i.news.qq.com/gw/pc_search/result'
url = 'https://so.toutiao.com/search?'
totalnum=3
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
session = requests.session()
qerhtml=self.reqPostMsg(url,session,keyword)
pattern = r'data-p="0">(.*?)</script>'
matches = re.findall(pattern, qerhtml)
if matches:
for match in matches:
print(match)
                    # 转成json格式:应使用 json.loads 解析字符串(json.dumps 只会把字符串再编码一次)
                    try:
                        json_data = json.loads(match)
                    except Exception as e:
                        json_data = {}
else:
print("未找到匹配的字符串")
# secList=jsonmsg['secList']
# for sec in secList:
# try:
# title=sec['newsList'][0]['title']
# durl=sec['newsList'][0]['url']
# pubtime=sec['newsList'][0]['time']
# source=sec['newsList'][0]['source']
# is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
# if is_member:
# continue
# detailmsg={
# 'title':title,
# 'detailUrl':durl,
# 'sourceTag':source,
# 'publishTag':pubtime
# }
# self.detailList.put(detailmsg)
# except Exception as e :
# continue
session.close()
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pyqqnews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
except Exception as e:
time.sleep(3)
self.logger.info("详情页解析异常!"+detailUrl)
else:
break
# time.sleep(5)
#解析详情
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
currentdate=self.getNowDate()
kword=self.searchkw
publishDate=detailmsg['publishTag']
publishDate=publishDate+''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg={
'title':detailmsg['title'],
'source':detailmsg['sourceTag'],
'detailurl':detailurl,
'content':content,
'contentHtml':contentWithTag,
'publishtime':publishDate,
'currentdate':currentdate,
'kword':kword
}
return detailmsg
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
html=''
try:
driver.get(url)
# 等待页面加载完成
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
finally:
driver.quit()
return html
def reqHtml(self,url):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'_qpsvr_localtk=0.13653936306726644; RK=GPtQWVDskM; ptcz=5f36ee88c33e1060914663a0a68c2fc547d594312b58222b351428c8ba8bba1f; uin=o2468741258; pac_uid=1_2468741258; iip=0; ad_play_index=20; ss=1',
'Host':'new.qq.com',
'Pragma':'no-cache',
'Referer':'https://new.qq.com/search?query=%E6%B5%99%E6%B1%9F%E5%9B%BD%E6%9C%89%E8%B5%84%E6%9C%AC%E8%BF%90%E8%90%A5%E5%85%AC%E5%8F%B8&page=1',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
res=requests.get(url,headers=headers,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
return text
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
raw_html=self.reqHtml(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('div[id="ArticleContent"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
self.logger.info("定位解析失败!")
if content:
return content,contentWithTag
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
if content:
return content,contentWithTag
try:
raw_html=self.webDriver(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('div[id="ArticleContent"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
self.logger.info("定位解析失败!")
if content:
return content,contentWithTag
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
except Exception as e:
try:
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
return content,contentWithTag
def detect_language(self,html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
        # 遍历所有标签,只保留 src 属性(如 img 的图片地址),其余属性全部去掉
        for tag in soup.find_all(True):
            tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html=soup.prettify()
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
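    # 用法示例(示意,地址为假设值):
    #   self.paserUrl('<a href="/news/1.html">详情</a>', 'https://www.example.com/list/')
    #   返回 BeautifulSoup 对象,其中 href 被补全为 'https://www.example.com/news/1.html'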
def getProcessitem(self,bdetail):
nowDate=self.getNowDate()
content=bdetail['content']
if content!='':
processitem={
"sid":self.sid,
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(self,processitem):
        producer = None
        try:
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
        finally:
            if producer:
                producer.close()
def run(self):
# # 获取每页URL
# c = threading.Thread(target=self.get_page_html)
# c.start()
# c.join()
# # 解析详情页
# t = threading.Thread(target=self.get_detail_html)
# t.start()
        self.get_page_html()
if __name__ == '__main__':
    # 示例占位参数(假设值),实际由任务调度传入
    zhuce = QQnewsSpider('测试关键词', 'KW-TEST-0001', 'test-sid')
zhuce.run()
# zhuce.driver.close()
\ No newline at end of file
#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
class QQnewsSpider(object):
def __init__(self,searchkw,wordsCode,sid):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.sogou.com/'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
#将列表数据插入到表中 baidu_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
conx,cursorM=self.connMysql()
for item in items:
nowtime=self.getNowDate()
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
except Exception as e:
self.logger.info("数据插入数据库失败!")
finally:
self.closeSql(conx,cursorM)
def connMysql(self):
# 创建MySQL连接
conx = pymysql.connect(host=self.config.get('mysql', 'host'),
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'))
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(self,conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# 解析页面
def parse_page(self):
self.logger.info('解析搜狗列表页')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
        uri = ''
        try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
            if urls:
                uri = urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
title=''
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
publishTag=''
detailmsg={
'title':title,
'detailUrl':detailUrl,
'sourceTag':sourceTag,
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
def getNowDate(self):
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
#智能抽取
def paserDetail(self,detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
# element = html2element(detailhtml)
except:
article_content={}
return article_content
#解析时间
def paserTime(self,publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
                # datetime.timedelta 不支持 months 参数,按每月30天近似
                delta = datetime.timedelta(days=30 * day)
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '-' in publishtime:
time_format = '%Y-%m-%d'
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
    # 获取每一页数据, 开爬.
def get_page_html(self):
self.logger.info("进入搜狗首页...")
self.driver.get(self.url)
self.driver.find_element(By.ID, 'query').send_keys(self.searchkw)
self.driver.find_element(By.ID, 'stb').click()
wait = WebDriverWait(self.driver, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)
self.driver.find_element(By.ID, 'sogou_news').click()
wait = WebDriverWait(self.driver, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)
self.logger.info("开始抓取首页...")
try:
flag, lists = self.parse_page()
if len(lists)<1:
return
except Exception as e:
time.sleep(5)
return
if len(lists)==0:
time.sleep(5)
for detail in lists:
durl=detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
continue
self.detailList.put(detail)
response = self.driver.page_source
html = etree.HTML(response)
        try:
            hasnext = html.xpath('//a[@id="sogou_next"]//text()')[0]
            hasnext = hasnext.strip()
        except Exception as e:
            hasnext = ''
timeFlag=False
while hasnext == '下一页':
try:
if self.page_num==2:
break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
try:
self.driver.find_element(By.XPATH, '//a[@id="sogou_next"]').click()
except Exception as e:
time.sleep(5)
continue
time.sleep(5)
flag, lists = self.parse_page()
if len(lists)<1:
break
for detail in lists:
publishTag=detail['publishTag']
# if publishTag:
# pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
# needDate='2022-01-01 00:00:00'
# needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
# if pubtime < needTime:
# timeFlag = True
# break
                    durl = detail['detailUrl']
                    is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
continue
self.detailList.put(detail)
if timeFlag:
break
try:
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//a[@id="sogou_next"]//text()')[0]
hasnext = hasnext.strip()
except Exception as e:
hasnext=''
except Exception as e:
time.sleep(5)
break
self.logger.info("抓取完毕")
#获取资讯内容信息
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
currentdate=self.getNowDate()
kword=self.searchkw
publishDate=detailmsg['publishTag']
publishDate=publishDate+''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg={
'title':detailmsg['title'],
'source':detailmsg['sourceTag'],
'detailurl':detailurl,
'content':content,
'contentHtml':contentWithTag,
'publishtime':publishDate,
'currentdate':currentdate,
'kword':kword
}
return detailmsg
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
html=''
try:
driver.get(url)
# 等待页面加载完成
# wait = WebDriverWait(self.driver, 20)
# wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
finally:
driver.quit()
return html
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
try:
lang=self.detect_language(title)
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
try:
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
print('抽取失败!!')
return content,contentWithTag
def detect_language(self,html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
# # js = "window.open('"+detailUrl+"')"
# # self.driver.execute_script(js)
# try:
# self.driver.get(detailUrl)
# except Exception as e:
# self.driver.quit()
# self.driver=self.createDriver()
# self.driver.get(detailUrl)
#
# response = self.driver.page_source
# bdetail=self.getDetailmsg(response,detailmsg)
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pysougou_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
except Exception as e:
time.sleep(3)
self.logger.info("详情页解析异常!"+detailUrl)
else:
break
# time.sleep(5)
# def getDetailmsg(self,detailhtml,detailmsg):
# try:
# detailurl=detailmsg['detailUrl']
# article_content=self.paserDetail(detailhtml,detailurl)
# content=article_content['content']
# contentWithTag=article_content['body_html']
# except Exception as e:
# self.logger.info('内容抽取失败')
# content=''
# contentWithTag=''
# currentdate=self.getNowDate()
# kword=self.searchkw
# publishtime=detailmsg['publishTag']
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
# detailmsg={
# 'title':detailmsg['title'],
# 'source':detailmsg['sourceTag'],
# 'detailurl':detailurl,
# 'content':content,
# 'contentHtml':contentWithTag,
# 'publishtime':publishDate,
# 'currentdate':currentdate,
# 'kword':kword
# }
# return detailmsg
def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
        # 遍历所有标签,只保留 src 属性(如 img 的图片地址),其余属性全部去掉
        for tag in soup.find_all(True):
            tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html=soup.prettify()
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def getProcessitem(self,bdetail):
nowDate=self.getNowDate()
content=bdetail['content']
if content!='':
processitem={
"sid":self.sid,
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(self,processitem):
        producer = None
        try:
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
        finally:
            if producer:
                producer.close()
def run(self):
# 获取每页URL
c = threading.Thread(target=self.get_page_html)
c.start()
c.join()
# 解析详情页
t = threading.Thread(target=self.get_detail_html)
t.start()
if __name__ == '__main__':
    # 示例占位参数(假设值),实际由任务调度传入
    zhuce = QQnewsSpider('测试关键词', 'KW-TEST-0001', 'test-sid')
# zhuce.run()
# zhuce.driver.close()
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from toutiaonewspider import QQnewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class QQnewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
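    # 关键词表达式展开示例(示意):
    #   self.getkeywords('(A|B)+(C|D)') -> ['A+C', 'A+D', 'B+C', 'B+D']
    #   self.getkeywords('A|B|C')       -> ['A', 'B', 'C']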
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
try:
searchEngines=keymsg['searchEngines']
except Exception as e:
searchEngines=[]
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
pass
# logger.info('+++++')
# keyword=keymsg['keyWord']
# keymsglist=self.getkeywords(keyword)
# for kw in keymsglist:
# kwmsg={
# 'kw':kw,
# 'wordsCode':wordsCode,
# 'sid':id
# }
# kwList.append(kwmsg)
return kwList
# def runSpider(self,kwmsg):
# try:
# searchkw=kwmsg['kw']
# wordsCode=kwmsg['wordsCode']
# sid=kwmsg['sid']
#
# baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
# baiduSpider.get_page_html()
# baiduSpider.get_detail_html()
# except Exception as e:
# logger.info('百度搜索异常'+searchkw)
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
qqnewsSpider=QQnewsSpider(searchkw,wordsCode,sid)
try:
qqnewsSpider.get_page_html()
except Exception as e:
try:
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
qqTaskJob=QQnewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
print('---------------')
while True:
try:
try:
keymsg=qqTaskJob.getkafka()
kwList=qqTaskJob.paserKeyMsg(keymsg)
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
                # 创建一个线程池,指定线程数量为1
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(qqTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
# -*- coding: utf-8 -*-
"""
任务集成测试
1、连接redis做取出
2、连接kafka做信息的获取,与存储
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from toutiaonewspider import QQnewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class QQnewsTaskJob(object):
def __init__(self):
# 创建ConfigParser对象
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
def getkafka(self):
# Kafka集群的地址
bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# 要订阅的主题
topic = self.config.get('kafka', 'topic')
groupId=self.config.get('kafka', 'groupId')
consumer = KafkaConsumer(topic, group_id=groupId,
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
try:
logger.info("value:",record.value)
keymsg=record.value
if keymsg:
break
else:
continue
#print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
except Exception as e:
logger.info("msg.value error:",e)
except KeyboardInterrupt as e:
keymsg={}
finally:
consumer.close()
return keymsg
def getkeyFromredis(self,codeid):
kvalue=self.r.get('KEY_WORDS_TO_REDIS::'+codeid)
kvalue=kvalue.decode('utf-8')
kvalue=json.loads(kvalue)
return kvalue
def getkeywords(self,keywords):
kwList=[]
if ')+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
elif len(kk2)==3:
result = list(itertools.product(kk2[0], kk2[1],kk2[2]))
elif len(kk2)==4:
result = list(itertools.product(kk2[0], kk2[1],kk2[2],kk2[3]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
elif '+(' in keywords:
k1List=keywords.split('+')
kk2=[]
for k2 in k1List:
k2=k2.strip("()")
k2List=k2.split('|')
kk2.append(k2List)
if len(kk2)==2:
result = list(itertools.product(kk2[0], kk2[1]))
for res in result:
kwstr=''
for kw in res:
kwstr+=kw+"+"
kwList.append(kwstr.strip('+'))
else:
k3=keywords.split("|")
kwList=k3
return kwList
def paserKeyMsg(self,keymsg):
logger.info('----------')
wordsCode=keymsg['wordsCode']
id=keymsg['id']
# try:
# searchEngines=keymsg['searchEngines']
# if 'java.util.ArrayList' in searchEngines:
# searchEngines=searchEngines[1]
# except Exception as e:
# searchEngines=[]
kwList=[]
searchEngines=['3']
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
else:
logger.info('+++++')
else:
logger.info('+++++searchEngines为空')
keyword=keymsg['keyWord']
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
'kw':kw,
'wordsCode':wordsCode,
'sid':id
}
kwList.append(kwmsg)
return kwList
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
qqnewsSpider=QQnewsSpider(searchkw,wordsCode,sid)
try:
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
logger.info("关键词采集结束!"+searchkw)
import random
if __name__ == '__main__':
qqnewsTaskJob=QQnewsTaskJob()
baseCore=BaseCore()
logger=baseCore.getLogger()
# ss='(中国机床工具工业协会|中国内燃机工业协会|中国机电工业价格协会|中国机械电子兵器船舶工业档案学会|中国仪器仪表行业协会|中国工程机械工业协会|中国文化办公设备制造行业协会|中国机械工业金属切削刀具技术协会|中国机械工业教育协会|中国汽车工业协会|中国机械通用零部件工业协会|中国环保机械行业协会|中国模具工业协会|中国机械工业勘察设计协会|中国机械制造工艺协会|中国机械工业审计学会|中国轴承工业协会|中国机电一体化技术应用协会|中国机械工程学会|中国液压气动密封件工业协会|中国铸造协会|中国通用机械工业协会|中国锻压协会|中国制冷空调工业协会|中国热处理行业协会|中国电工技术学会|中国仪器仪表学会|中国石油和石油化工设备工业协会|中国表面工程协会|中国食品和包装机械工业协会|中国焊接协会|中国汽车工程学会|中国塑料机械工业协会|中国机械工业企业管理协会|中国印刷及设备器材工业协会|中国机械工业质量管理协会|中国电器工业协会|中国机械工业安全卫生协会|中国重型机械工业协会|中国机械工业标准化技术协会|中国机械工业职工思想政治工作研究会|中国农业机械工业协会|中国机电装备维修与改造技术协会 |机械工业信息研究院|机械工业教育发展中心|机械工业经济管理研究院|机械工业信息中心|机械工业人才开发服务中心|机械工业北京电工技术经济研究所|机械工业技术发展基金会|机械工业哈尔滨焊接技术培训中心|机械工业仪器仪表综合技术经济研究所)+(私收会费|私吞|肆意牟利|损失浪费|索贿|贪财|贪官污吏|贪污|违背组织原则|违法|违纪|为官不廉|为政擅权|窝案|舞弊|泄露国家机密|信鬼神|性关系|虚假信息|虚假招标|隐瞒不报|隐瞒真相|营私|鬻爵|主动投案|资产流失|钻空子|钻漏洞|被调查|被双开|不担当|不老实|不良影响|不正当|不作为|超标准建设|超标准装修|吃空饷|吃拿卡要|渎职|对党不忠诚|非法批地|腐败|腐虫|腐化堕落|公车私用|公费开销|公款吃喝|公款出境|公款旅游|勾结|官迷心窍|好色|回扣|贿赂|挤占挪用|纪律审查|监察调查|监守自盗|践踏法律|接受审查调查|截留克扣|开除党籍|开除公职|抗议|利欲熏心|敛财|乱摊派|乱作为|落马|落网|买官|买卖审批权限|卖官|谋取暴利|谋取私利|目无法纪|幕后交易|弄虚作假|挪用公款|骗取|钱色交易|潜规则|侵害权益|侵吞公款|侵占挪用|圈子文化|权利扭曲|权钱交易|权色交易|山头主义|涉案|生活糜烂|生活奢靡|失察|失管|收送|受贿|双规|双开|私分|私人会所|私设小金库|负面|下降|违规|不利|亏损|上诉|不法|不良名单|停职|公开谴责|公诉|内幕交易|刑事拘留|刑事责任|刑拘|判决|判刑|判赔|司法处置|合同纠纷|处分|处罚|强制执行|仲裁|伪造|伪造公章|投案|投诉|拘留|接受调查|控诉|查封|涉嫌|涉诉监察调查|纠纷|经营异常名录|缉捕|罚单|罚款|罚金|罪犯|自首|获刑|行贿|警示函|贪腐|违约金|追究刑责|造假|逮捕|非法|非法集资判决书|申诉|纠纷|通报|开除|留党察看|追债|逃债|资产负债率|情色交易|搞权钱|曝光|黑料|重罚|虚假报告|侵犯)'
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
print('---------------')
while True:
try:
codeList=[]
codeList.append('KW-20220602-0003')
for codeid in codeList:
try:
keymsg=qqnewsTaskJob.getkeyFromredis(codeid)
kwList=qqnewsTaskJob.paserKeyMsg(keymsg)
if len(kwList)<1:
continue
logger.info(f"需要搜索的关键词:{kwList}")
except Exception as e:
logger.info("从kafka拿取信息失败!")
time.sleep(5)
continue
if kwList:
                    # 创建一个线程池,指定线程数量为1
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
result = future.result()
# 处理任务的执行结果
logger.info(f"任务执行结束: {result}")
except Exception as e:
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
#coding=utf-8
#coding=utf-8
......@@ -402,6 +402,7 @@ class BaiduSpider(object):
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -190,7 +190,7 @@ if __name__ == '__main__':
while True:
try:
codeList=[]
codeList.append('KW-20230818-0003')
codeList.append('KW-20230925-0002')
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
......@@ -207,7 +207,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......