Commit 43034e09 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

# -*- coding:utf-8 -*-
import datetime
import time
@@ -26,11 +26,11 @@ def page_list():
     header = {
         'Host':'xcx.www.gov.cn',
         'Connection':'keep-alive',
-        'Content-Length':'72',
+        'Content-Length':'25',
         'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
         'x-tif-did':'pb5XUGL1Zm',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
+        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
-        'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+        'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
         'Content-Type':'application/json',
         'xweb_xhr':'1',
         'dgd-pre-release':'0',
@@ -40,9 +40,9 @@ def page_list():
         'Sec-Fetch-Site':'cross-site',
         'Sec-Fetch-Mode':'cors',
         'Sec-Fetch-Dest':'empty',
-        'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+        'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
         'Accept-Encoding':'gzip, deflate, br',
-        'Accept-Language':'zh-CN,zh'
+        'Accept-Language':'zh-CN,zh;q=0.9'
         }
     url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
     for i in range(1,445):
@@ -79,23 +79,23 @@ def detailpaser(dmsg):
     hh={
         'Host':'xcx.www.gov.cn',
         'Connection':'keep-alive',
-        'Content-Length':'14',
+        'Content-Length':'25',
         'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
         'x-tif-did':'pb5XUGL1Zm',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
+        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
-        'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+        'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
         'Content-Type':'application/json',
         'xweb_xhr':'1',
         'dgd-pre-release':'0',
-        'x-yss-page':'publicService/pages/policyQALibrary/detail/detail',
+        'x-yss-page':'publicService/pages/policyQALibrary/index/index',
         'x-yss-city-code':'4400',
         'Accept':'*/*',
         'Sec-Fetch-Site':'cross-site',
         'Sec-Fetch-Mode':'cors',
         'Sec-Fetch-Dest':'empty',
-        'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+        'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
         'Accept-Encoding':'gzip, deflate, br',
-        'Accept-Language':'zh-CN,zh'
+        'Accept-Language':'zh-CN,zh;q=0.9'
         }
     try:
         durl=dmsg['url']
......
@@ -74,19 +74,7 @@ class HgDownFile(object):
         return cookie
     # request the file download
     def reqDownFile(self,data):
-        header={
-            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'Accept-Encoding':'gzip, deflate',
-            'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-            'Cache-Control':'max-age=0',
-            'Content-Type':'application/x-www-form-urlencoded',
-            'Host':'stats.customs.gov.cn',
-            'Origin':'http://stats.customs.gov.cn',
-            'Proxy-Connection':'keep-alive',
-            'Upgrade-Insecure-Requests':'1',
-            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
-            'Cookie': self.getcookie()
-        }
         data=data
         proxy={}
         # response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
@@ -95,19 +83,36 @@ class HgDownFile(object):
         while statuscode != 200:
             # time.sleep(5)
             try:
+                # header={
+                #     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                #     'Accept-Encoding':'gzip, deflate',
+                #     'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+                #     'Cache-Control':'max-age=0',
+                #     'Content-Type':'application/x-www-form-urlencoded',
+                #     'Host':'stats.customs.gov.cn',
+                #     'Origin':'http://stats.customs.gov.cn',
+                #     'Proxy-Connection':'keep-alive',
+                #     'Upgrade-Insecure-Requests':'1',
+                #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
+                #     'Cookie': self.getcookie()
+                # }
                 header={
                     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                     'Accept-Encoding':'gzip, deflate',
                     'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                     'Cache-Control':'max-age=0',
+                    'Content-Length':'306',
                     'Content-Type':'application/x-www-form-urlencoded',
                     'Host':'stats.customs.gov.cn',
                     'Origin':'http://stats.customs.gov.cn',
                     'Proxy-Connection':'keep-alive',
+                    #'Referer':'http://stats.customs.gov.cn/queryData/queryDataList?pageNum=1&codeLength=8&currentStartTime=202203&currentEndTime=202309&currentDateBySource=202309&selectTableState=3&orderType=CODE%20ASC%20DEFAULT&iEType=0&currencyType=usd&year=2022&startMonth=1&endMonth=11&monthFlag=&unitFlag=false&unitFlag1=false&outerField1=&outerField2=CODE_TS&outerField3=&outerField4=&outerValue1=&outerValue2=&outerValue3=&outerValue4=',
                     'Upgrade-Insecure-Requests':'1',
                     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
                     'Cookie': self.getcookie()
                 }
+                data_str = '&'.join([f"{key}={value}" for key, value in data.items()])
                 response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
                 # response.encoding = response.apparent_encoding
                 response.encoding = 'GB2312'
@@ -218,6 +223,7 @@ class HgDownFile(object):
         # 2022 1-1  202202  2   data before Jan 2022 uses 2
         # 2022 1-2  202202  3   2022 cumulative data uses 3
         # 2022 2-2  202202  1   data after Jan 2022 uses 1
+        selectTableState=2
         if year<2022:
             selectTableState= 2 # data before 202202 uses 2, after it uses 1
         else:
@@ -225,9 +231,11 @@ class HgDownFile(object):
             e=int(endMonth)
             if year==2022 and s<e: # 2022 cumulative data needs its own parameter
                 selectTableState= 3
+                if e==2:
+                    selectTableState= 2
             elif year==2022 and e==1:
                 selectTableState= 2
-            else:
+            elif year==2022 and s==e:
                 selectTableState= 1 # data before 202202 uses 2, after it uses 1
         param={
             'pageSize': 10,
@@ -237,9 +245,9 @@ class HgDownFile(object):
             'startMonth': startMonth,
             'endMonth': endMonth,
             'monthFlag':'',
-            'unitFlag': False,
+            'unitFlag': True,
-            'unitFlag1': False,
+            'unitFlag1': True,
-            'codeLength': '8',
+            'codeLength': 8,
             'outerField1': outerField1,
             'outerField2':'',
             'outerField3':'',
@@ -250,24 +258,26 @@ class HgDownFile(object):
             'outerValue4':'',
             'orderType': 'CODE ASC DEFAULT',
             'selectTableState': selectTableState, # data before 202201 uses 2, after it uses 1
-            'currentStartTime': '202202',
+            'currentStartTime': 202203,
         }
         return param
     # parameter setup for joint-query fields
     def setcodesAndProductparam(self,iEType,currencyType,year,startMonth,endMonth,outerField1,filedCode):
+        selectTableState= 1 # default is 1
         if year<2022:
-            selectTableState= 2 # data before 202202 uses 2, after it uses 1
+            selectTableState= 2 # data before 202203 uses 2
         else:
             s=int(startMonth)
             e=int(endMonth)
-            if year==2022 and s<e: # 2022 cumulative data needs its own parameter
+            if year==2022 and s<e: # 2022 cumulative data uses 3
                 selectTableState= 3
+                if e==2:
+                    selectTableState= 2
             elif year==2022 and e==1:
-                selectTableState= 2
+                selectTableState= 2 # January 2022 single-month data uses 2
-            else:
+            elif year==2022 and s==e:
-                selectTableState= 1 # data before 202202 uses 2, after it uses 1
+                selectTableState= 1 # 2022 single-month data other than January uses 1
         param={
             'pageSize': 10,
             'iEType': iEType,
@@ -276,8 +286,8 @@ class HgDownFile(object):
             'startMonth': startMonth,
             'endMonth': endMonth,
             'monthFlag':'',
-            'unitFlag': False,
+            'unitFlag': True,
-            'unitFlag1': False,
+            'unitFlag1': True,
             'codeLength': '8',
             'outerField1': outerField1,
             'outerField2':'CODE_TS',
@@ -289,7 +299,7 @@ class HgDownFile(object):
             'outerValue4':'',
             'orderType': 'CODE ASC DEFAULT',
             'selectTableState': selectTableState,
-            'currentStartTime': '202202',
+            'currentStartTime': 202203,
         }
         return param
......
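The selectTableState rules in the hunks above are easier to check outside diff form. Below is a minimal restatement of the date logic as a standalone sketch; the helper name is hypothetical and this is not code from the repo:

def select_table_state(year: int, start_month: int, end_month: int) -> int:
    # pre-2022 data lives in table 2
    if year < 2022:
        return 2
    if year == 2022:
        if start_month < end_month:
            # 2022 cumulative ranges use 3, except ranges ending in February
            return 2 if end_month == 2 else 3
        if end_month == 1:
            # January 2022 as a single month uses 2
            return 2
        if start_month == end_month:
            # other single 2022 months use 1
            return 1
    # fall back to the default the patch sets up front
    return 2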
@@ -6,7 +6,7 @@ import json
 import pymysql
 from pyquery import PyQuery as pq
 from flask_cors import cross_origin
+from urllib.parse import unquote
 '''
@@ -71,12 +71,15 @@ def index():
 def get_news():
     data=request.form
-@app.route('/task/setCookie', methods=['GET'])
+@app.route('/task/setCookie', methods=['GET','POST'])
-# @cross_origin()
+@cross_origin()
 def setCookie():
     try:
-        cookie = request.args.get('cookie')
+        # cookie = request.args.get('cookie')
-        r.sadd('hgcookie',cookie)
+        hgcookie = request.form.get('cookie')
+        hgcookie = unquote(hgcookie)
+        r.sadd('hgcookie',hgcookie)
+        print(f'setCookie添加cookie成功到redis{hgcookie}')
     except Exception as e:
         print('error')
     return 'succes'
@@ -100,7 +103,7 @@ def getCookieSize():
     return jsonify(data)
 @app.route('/task/getHtml', methods=['POST'])
-# @cross_origin()
+@cross_origin()
 def getnewMonth():
     try:
         html = request.form.get('html')
......
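For reference, a hypothetical client call against the revised endpoint: the sender URL-encodes the cookie and the server unquotes it before storing it in Redis. The host, port, and cookie value here are assumptions:

import requests
from urllib.parse import quote

cookie = 'SESSION=abc123; token=xyz'  # hypothetical cookie value
resp = requests.post('http://127.0.0.1:5000/task/setCookie',
                     data={'cookie': quote(cookie)},  # URL-encode, matching the server-side unquote
                     timeout=10)
print(resp.text)  # the endpoint answers 'succes'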
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'663',
'Content-Type':'application/json',
'Cookie':'SESSION=MGFhMGQxNDItM2MyOS00NjU5LWI2MTgtZjdiM2UxNjFkMGI3; _trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false',
'Host':'www.cq.gov.cn',
'Origin':'https://www.cq.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.cq.gov.cn/zwgk/search.html?DOCTITLE=REITs&DEPT=&gte=&lte=&REFERENCENO=&nh=&number=',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
dlist=[]
pagenum=3
for i in range(1,pagenum):
log.info(f'重庆采集第{i}页列表')
lurl='https://www.cq.gov.cn/irs/front/list'
data={
"customFilter": {
"operator": "and",
"properties": [],
"filters": [
{
"operator": "or",
"properties": [
{
"property": "f_202121500898",
"operator": "eq",
"value": "REITs"
},
{
"property": "f_202142777829",
"operator": "eq",
"value": "REITs"
}
],
"filters": []
},
{
"operator": "or",
"properties": [
{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-09 16:14:20"
},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-09 16:14:20"
}
],
"filters": [
{
"operator": "and",
"properties": [
{
"property": "f_202146838317",
"operator": "eq",
"value": None
},
{
"property": "f_202146235090",
"operator": "eq",
"value": None
}
]
}
]
}
]
},
"sorts": [],
"tableName": "t_1775cd018c6",
"tenantId": "7",
"pageSize": 10,
"pageNo": i
}
lcont=reqbase.reqPostHtml(lurl,header,data)
if lcont:
try:
data=json.loads(lcont)
datas=data['data']['list']
for lmsg in datas:
try:
title=lmsg['f_202121500898']
subtitle=''
summary=lmsg['f_202142777829']
createDate=''
writeDate=''
pubDate=lmsg['save_time']
source=lmsg['f_202121437464']
durl=lmsg['doc_pub_url']
wenjianhao=lmsg['f_202121837479']
suoyihao=lmsg['f_202121273539']
content=''
siteweb='重庆市人民政府'
except Exception as e:
continue
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'wenjianhao':wenjianhao,
'suoyihao':suoyihao,
}
is_member = rr.sismember('reis_cqgov', durl)
if is_member:
continue
detailmsg=paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_cqgov',durl)
except Exception as e:
log.info(f'列表解析异常{e}')
reqbase.pdwriterXLS(dlist,'重庆市人民政府')
def paserdetail(detailmsg):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'_trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false; _trs_user=',
'Host':'www.cq.gov.cn',
'Pragma':'no-cache',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
durl=detailmsg['durl']
dhmsg=reqbase.reqGetHtml(durl,headers)
try:
log.info(f'详情请求地址:{durl}')
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
contentWithTag,content=soupPaserHtml(soup,'div[class="zcwjk-xlcon"]')
if not content:
contentWithTag,content=soupPaserHtml(soup,'div[class="document mt-1 mt-12"]')
if not content:
log.info(f'详情内容为空:{durl}')
contentWithTag,content=soupPaserHtml(soup,'div[class="view TRS_UEDITOR trs_paper_default trs_word"]')
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
except Exception as e:
print(f'详情解析异常{e}')
return detailmsg
def soupPaserHtml(soup,csstag):
try:
tagmsg=soup.select(csstag)[0]
tagmsgtext=tagmsg.text
except Exception as e:
tagmsg=''
tagmsgtext=''
log.info(f'标签解析异常{e}')
return tagmsg,tagmsgtext
if __name__ == '__main__':
getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'385',
'Content-Type':'application/json',
'Cookie':'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699513657; SEARCHHISTORY=[%22REiTs%22]',
'Host':'www.gxzf.gov.cn',
'Origin':'http://www.gxzf.gov.cn',
'Pragma':'no-cache',
'Referer':'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
dlist=[]
pagenum=5
for i in range(1,pagenum):
log.info(f'采集第{i}页列表')
lurl='http://www.gxzf.gov.cn/irs/front/search'
data={
"code": "181aedaa542",
"dataTypeId": "241",
"configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7",
"searchWord": "REiTs",
"orderBy": "related",
"searchBy": "all",
"appendixType": "",
"granularity": "ALL",
"isSearchForced": "0",
"filters": [],
"pageNo": i,
"pageSize": 10,
"isAdvancedSearch": None,
"isDefaultAdvanced": None,
"advancedFilters": None,
"advancedFilters ": None,
"historySearchWords": [
"REiTs"
]
}
lcont=reqbase.reqPostHtml(lurl,header,data)
if lcont:
try:
data=json.loads(lcont)
datas=data['data']['middle']['listAndBox']
for lmsgg in datas:
lmsg=lmsgg['data']
title=lmsg['title']
subtitle=''
summary=lmsg['table-7']
createDate=''
writeDate=lmsg['table-4']
pubDate=lmsg['time']
source=lmsg['table-3']
durl=lmsg['url']
docNumberStr=lmsg['table-5']
reNum=lmsg['table-1']
content=''
siteweb='广西壮族自治区人民政府'
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'docNumberStr':docNumberStr,
'reNum':reNum,
}
is_member = rr.sismember('reis_gxgov', durl)
if is_member:
continue
detailmsg=paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_gxgov',durl)
except Exception as e:
log.info(f'列表解析异常{e}')
reqbase.pdwriterXLS(dlist,'广西壮族自治区人民政府')
def paserdetail(detailmsg):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; SEARCHHISTORY=[%22REiTs%22]; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699514234',
'Host':'www.gxzf.gov.cn',
'Pragma':'no-cache',
'Referer':'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
durl=detailmsg['durl']
dhmsg=reqbase.reqGetHtml(durl,headers)
try:
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
#class_type=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(1)>td:nth-child(4)')[1]
pub_jigou=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(1)')[1]
write_data=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(2)')[1]
#file_num=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(2)')[1]
#pub_data=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(4)')[1]
contentWithTag,content=soupPaserHtml(soup,'div[class="article-con"]')
if not content:
contentWithTag,content=soupPaserHtml(soup,'div[class="zw"]')
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
# detailmsg['class_type']=class_type
detailmsg['pub_jigou']=pub_jigou
detailmsg['write_data']=write_data
except Exception as e:
print(f'详情解析异常{e}')
return detailmsg
def soupPaserHtml(soup,csstag):
try:
tagmsg=soup.select(csstag)[0]
tagmsgtext=tagmsg.text
except Exception as e:
tagmsg=''
tagmsgtext=''
log.info(f'标签解析异常{e}')
return tagmsg,tagmsgtext
if __name__ == '__main__':
getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153159&searchUseTime-349',
'Host':'www.hainan.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
dlist=[]
pagenum=5
for i in range(1,pagenum):
log.info(f'采集第{i}页列表')
lurl=f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={i}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
lcont=reqbase.reqGetHtml(lurl,header)
if lcont:
try:
soup = BeautifulSoup(lcont, 'html.parser')
soup = reqbase.paserUrl(str(soup), lurl)
divlist=soup.select('div[id="showPage"]>div')
for lmsg in divlist:
title=soupPaserHtml(lmsg,'h3>a')[1]
subtitle=''
summary=''
createDate=''
writeDate=''
pubDate=soupPaserHtml(lmsg,'span[class="quily-con"]')[1]
source=soupPaserHtml(lmsg,'a[class="address-con permitU"]')[1]
try:
durl=soupPaserHtml(lmsg,'h3>a')[0].get('href')
except Exception as e:
durl=''
continue
docNumberStr=''
reNum=''
content=''
siteweb='海南省人民政府'
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'docNumberStr':docNumberStr,
'reNum':reNum,
}
is_member = rr.sismember('reis_hainangov', durl)
if is_member:
continue
detailmsg=paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_hainangov',durl)
except Exception as e:
log.info(f'列表解析异常{e}')
reqbase.pdwriterXLS(dlist,'海南省人民政府')
def paserdetail(detailmsg):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'HttpOnly=true; 4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153247&searchUseTime-337; HA_STICKY_apps=apps.srv34; Hm_lvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515700; yfx_c_g_u_id_10005682=_ck23110915414012919174127333485; yfx_f_l_v_t_10005682=f_t_1699515700292__r_t_1699515700292__v_t_1699515700292__r_c_0; _trs_uv=loqvrn5x_4549_5u3r; _trs_ua_s_1=loqvrn5x_4549_1lnl; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515718',
'Host':'www.hainan.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=1&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
durl=detailmsg['durl']
dhmsg=reqbase.reqGetHtml(durl,headers)
try:
log.info(f'解析详情地址:{durl}')
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
souyihao=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(1)')[1]
fenlei=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(2)')[1]
fawenjiguan=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(1)')[1]
write_data=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(2)')[1]
wenhao=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(1)')[1]
pub_data=soupPaserHtml(soup,'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(2)')[1]
contentWithTag,content=soupPaserHtml(soup,'div[id="zoom"]')
if not content:
contentWithTag,content=soupPaserHtml(soup,'div[class="zw"]')
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
detailmsg['souyihao']=souyihao
detailmsg['fenlei']=fenlei
detailmsg['fawenjiguan']=fawenjiguan
detailmsg['wenhao']=wenhao
detailmsg['pub_data']=pub_data
detailmsg['write_data']=write_data
except Exception as e:
print(f'详情解析异常{e}')
return detailmsg
def soupPaserHtml(soup,csstag):
try:
tagmsg=soup.select(csstag)[0]
tagmsgtext=tagmsg.text
except Exception as e:
tagmsg=''
tagmsgtext=''
log.info(f'标签解析异常{e}')
return tagmsg,tagmsgtext
if __name__ == '__main__':
getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Host':'searchapi.henan.gov.cn',
'Origin':'https://www.henan.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.henan.gov.cn/',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-site',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
dlist=[]
pagenum=10
for i in range(1,pagenum):
log.info(f'henan采集第{i}页列表')
lurl=f'https://searchapi.henan.gov.cn/open/api/external?keywords=&siteId=4500000001&allKeyword=&anyKeyword=&noKeyword=&searchRange=-1000&sortType=200&beginTime=&endTime=&pageNumber={i}&pageSize=15&fileType=3&channelMarkId=45000000010115416542055691'
lcont=reqbase.reqGetHtml(lurl,header)
if lcont:
try:
data=json.loads(lcont)
datas=data['data']['datas']
for lmsg in datas:
title=lmsg['title']
subtitle=lmsg['subtitle']
summary=lmsg['summary']
createDate=lmsg['createDate']
writeDate=lmsg['writeDate']
pubDate=lmsg['pubDate']
source=lmsg['source']
durl=lmsg['selfUrl']
docNumberStr=lmsg['docNumberStr']
reNum=lmsg['reNum']
content=lmsg['content']
siteweb='河南省人民政府'
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'docNumberStr':docNumberStr,
'reNum':reNum,
}
is_member = rr.sismember('reis_henangov', durl)
if is_member:
continue
paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_henangov',durl)
except Exception as e:
print(f'请求异常{e}-异常页码{i}')
reqbase.pdwriterXLS(dlist,'河南省人民政府')
def paserdetail(detailmsg):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'zh_choose=n; yfx_c_g_u_id_10000001=_ck23110818022219777515353379336; yfx_f_l_v_t_10000001=f_t_1699437742968__r_t_1699437742968__v_t_1699437742968__r_c_0',
'Host':'www.henan.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.henan.gov.cn/zwgk/fgwj/szfl/',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
durl=detailmsg['durl']
dhmsg=reqbase.reqGetHtml(durl,headers)
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
contentWithTag=soup.select('div[id="content"]')[0]
content = contentWithTag.text # plain-text body without tags
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
return detailmsg
if __name__ == '__main__':
getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'JSESSIONID=B970564BAAD37BB8E9EF19F78FF45618; Hm_lvt_5544783ae3e1427d6972d9e77268f25d=1699492684; token=43cfe913-ec04-4aa3-b037-96903ccaa188; uuid=43cfe913-ec04-4aa3-b037-96903ccaa188; Hm_lpvt_5544783ae3e1427d6972d9e77268f25d=1699492955; 924omrTVcFchT=0IZtGVa8M20F2wJXy_C6l9PPFZOO1SBDdB3qZtsaLbLGaQ5t4l6Vt8HF9dIwhxtBcLdkdRZwlK42NCaEUjZZoPsXZAZ1o.tgK50mj8FJZTM5zCxcVg3w4cOCSM4BvYApzj7YMWHycK14.NY6Y.AP6bW6g0jDIqZlbp2hKSpDfZYBhjsgwJJraXKf2S4sgG6swjXFVVUHGngt2GMQPUZQRsE0_tL9Pz3_h6JeSD9qHWLOVKJWz0z8hdC_F4kiGZj2FRjjSUZp0VLUS8pjkrJdGYrKhC5xwGy8xSFYBE_trVuCFjr8.vhLqBONYkoWvZM2qNX_WZXg_3wTLMqCMrjoCmkvHf7B9.MMVu8tMC4hwDT4wjeyoNoRjIlgKuwE.aGhn',
'Host':'www.hubei.gov.cn',
'Pragma':'no-cache',
'Referer':'http://www.hubei.gov.cn/site/hubei/search.html',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
dlist=[]
pagenum=3
for i in range(1,pagenum):
log.info(f'湖北采集第{i}页列表')
lurl=f'http://www.hubei.gov.cn/igs/front/search.jhtml?position=&timeOrder=&code=872801132c71495bbe5a938f6acff5aa&orderBy=all&pageSize=10&type=%E6%96%87%E4%BB%B6&time=&chnldesc=&pageNumber={i}&aggrFieldName=chnldesc&sortByFocus=true&siteId=50&name=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitename=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitetype=%E7%9C%81%E6%94%BF%E5%BA%9C&searchWord=REITS&6LDjm9Ls=0t3_jtGlqEJQLVtYPg5o4LE8KRsDcOrdhcQJ2gpgbWwP9rQyfChv7ADuy_hXWgy2abOG9jq8_hKyrFekh7IWmLmb9VBbEQh7tULy0_6L3zqkGOSoDWEcli5Ympa58KVMviSIxe_LiYGE'
lcont=reqbase.reqGetHtml(lurl,header)
if lcont:
try:
data=json.loads(lcont)
datas=data['page']['content']
for lmsg in datas:
title=lmsg['DOCTITLE']
subtitle=lmsg['FileName']
summary=lmsg['DOCCONTENT']
createDate=''
writeDate=''
pubDate=lmsg['PUBDATE']
source=lmsg['publisher']
durl=lmsg['url']
docNumberStr=''
reNum=lmsg['IdxID']
content=''
siteweb='湖北省人民政府'
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'docNumberStr':docNumberStr,
'reNum':reNum,
}
is_member = rr.sismember('reis_hubeigov', durl)
if is_member:
continue
paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_hubeigov',durl)
except Exception as e:
print(f'请求异常{e}-异常页码{i}')
reqbase.pdwriterXLS(dlist,'湖北省人民政府')
def paserdetail(detailmsg):
# opt = webdriver.ChromeOptions()
# opt.add_argument("--ignore-certificate-errors")
# opt.add_argument("--ignore-ssl-errors")
# opt.add_experimental_option("excludeSwitches", ["enable-automation"])
# opt.add_experimental_option('excludeSwitches', ['enable-logging'])
# opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
# chromedriver = r'D:\chrome62\cmdvip\chromedriver.exe'
# driver = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
headers={
'Host':'www.hubei.gov.cn',
'Proxy-Connection':'keep-alive',
'Pragma':'no-cache',
'Cache-Control':'no-cache',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Referer':'http://www.hubei.gov.cn/zfwj/ezbf/202303/t20230303_4569220.shtml',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'924omrTVcFchT=0ydF5mX9FkIQDMfAAr4A60Yt6sHsXZOzTlm30NLRHm2OwX_YgXaMFBUe3WeNORSf0ZqYHjvBxVL5CXNSWoCOOThArMBpBDzXVdWxVIoA5YBGBLbPUN4CbcQQLZEty.w1MZkgI1pn30uv5STvyCsHLoYGTDHDSIbaURf4XIXzC3fNhxDX.nR5ZWV_HBo9ZAyC5I93.otc4vf7nD6v3Tympw6h2ZUuyAJ0Q7Nes3n0dIB_BIhwCkjyvJibUZt04ggU6XeXnS.qXr2CaM8BJQQ4mdLJ5apGqInkYuNv2GJP1AvL',
}
durl=detailmsg['durl']
try:
dhmsg=reqbase.reqGetHtml(durl,headers)
if dhmsg:
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
idx_num=soupPaserHtml(soup,'div[class="hbgov-article-meta"]>div:nth-child(1)')[1]
class_type=soupPaserHtml(soup,'div[class="hbgov-article-meta"]>div:nth-child(2)')[1]
dplay_gov=soupPaserHtml(soup,'div[class="hbgov-article-meta"]>div:nth-child(3)')[1]
pub_date=soupPaserHtml(soup,'div[class="hbgov-article-meta"]>div:nth-child(4)')[1]
fileNum=soupPaserHtml(soup,'div[class="hbgov-article-meta"]>div:nth-child(5)')[1]
contentWithTag,content=soupPaserHtml(soup,'div[class="hbgov-article-content"]')
# contentWithTag=soup.select('div[class="hbgov-article-content"]')[0]
content = contentWithTag.text # plain-text body without tags
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
detailmsg['idx_num']=idx_num
detailmsg['class_type']=class_type
detailmsg['dplay_gov']=dplay_gov
detailmsg['pub_date']=pub_date
detailmsg['fileNum']=fileNum
except Exception as e:
print(e)
return detailmsg
def soupPaserHtml(soup,csstag):
try:
tagmsg=soup.select(csstag)[0]
tagmsgtext=tagmsg.text
except Exception as e:
tagmsg=''
tagmsgtext=''
log.info(f'标签解析异常{e}')
return tagmsg,tagmsgtext
if __name__ == '__main__':
getList()
# https://www.henan.gov.cn/
import json
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import xlsxwriter
import openpyxl
import BaseCore
baseCore=BaseCore.BaseCore()
def reqGetHtml(url,header):
for i in range(0,3):
try:
proxy=baseCore.get_proxy()
response=requests.get(url=url,headers=header,proxies=proxy,verify=False,timeout=10)
response.encoding=response.apparent_encoding
hcont=response.text
if hcont:
break
except Exception as e:
hcont=''
return hcont
def reqPostHtml(url,header,data):
for i in range(0,3):
try:
proxy=baseCore.get_proxy()
if isinstance(data, str):
res=requests.post(url=url,data=data,headers=header,proxies=proxy,verify=False,timeout=10)
else:
res=requests.post(url=url,data=json.dumps(data),headers=header,verify=False,timeout=10)
hcont=res.text
if hcont:
break
except Exception as e:
hcont=''
return hcont
def reqPostStrHtml(url,header,data):
for i in range(0,3):
try:
res=requests.post(url=url,data=data,headers=header,verify=False,timeout=10)
hcont=res.text
if hcont:
break
except Exception as e:
hcont=''
return hcont
def createDriver():
chrome_driver = r'D:\chrome\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =r'D:\Google\Chrome\Application\chrome.exe'
# set a proxy
# proxy = "127.0.0.1:8080" # proxy address and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
return driver
# convert relative addresses in the html into absolute addresses
def paserUrl(html, listurl):
# fetch all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# walk the tags, converting relative addresses to absolute ones
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def pdwriterXLS(dlist,siteName):
df_out = pd.DataFrame(data=dlist)
df_out.to_excel(siteName+'.xlsx', engine='xlsxwriter', index=False)
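A quick usage sketch for paserUrl above, with a hypothetical HTML snippet; relative href/src values are resolved against the list URL:

from reqbase import paserUrl

html = '<a href="/zwgk/doc.html">doc</a><img src="img/x.png"/>'
soup = paserUrl(html, 'https://www.cq.gov.cn/irs/front/list')
print(soup)
# <a href="https://www.cq.gov.cn/zwgk/doc.html">doc</a>
# <img src="https://www.cq.gov.cn/irs/front/img/x.png"/>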
# _*_ coding:utf-8 _*_
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by link
4. Output fields for the file content
5. Request methods used: requests, selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()
rr=baseCore.r
def getList():
header={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Content-Length':'185',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Origin':'https://sheng.so-gov.cn',
'Pragma':'no-cache',
'Referer':'https://sheng.so-gov.cn/',
'Sec-Ch-Ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-site',
'Suid':'cf354a807a13d634f76bf167610f9c07',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
dlist=[]
pagenum=4
for i in range(3,pagenum):
log.info(f'云南采集第{i}页列表')
lurl='https://api.so-gov.cn/s'
data=f'siteCode=5300000033&tab=zcwj&timestamp=1699525503095&wordToken=72df37fd2f1058524e0c7467610d9ab7&page={i}&pageSize=20&qt=REITs&timeOption=0&sort=dateDesc&keyPlace=0&fileType=&toolsStatus=1'
lcont=reqbase.reqPostHtml(lurl,header,data)
if lcont:
try:
data=json.loads(lcont)
datas=data['data']['search']['searchs']
for myValues in datas:
lmsg=myValues['myValues']
try:
title=lmsg['DRETITLEO']
subtitle=''
summary=lmsg['QUICKDESCRIPTION']
createDate=''
writeDate=''
pubDate=''
source=lmsg['WEBSITENAME']
durl=lmsg['URL']
wenjianhao=lmsg['C3']
suoyinhao=''
content=''
siteweb='云南省人民政府'
except Exception as e:
continue
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'wenjianhao':wenjianhao,
'suoyinhao':suoyinhao,
}
is_member = rr.sismember('reis_yngov', durl)
if is_member:
continue
detailmsg=paserdetail(detailmsg)
dlist.append(detailmsg)
rr.sadd('reis_yngov',durl)
except Exception as e:
log.info(f'列表解析异常{e}')
reqbase.pdwriterXLS(dlist,'云南人民政府-政策2')
def paserdetail(detailmsg):
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'_gscu_802487706=99519044sn0ozi20; _gscbrs_802487706=1; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1699519045; TrsAccessMonitor=TrsAccessMonitor-1699519056000-2819180807; _gscs_802487706=995190442fewym20|pv:2; Hm_lpvt_b9099e95d08017e30f6285a8b55eb822=1699519714',
'Host':'www.yn.gov.cn',
'Pragma':'no-cache',
'Referer':'https://sheng.so-gov.cn/',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
durl=detailmsg['durl']
dhmsg=reqbase.reqGetHtml(durl,headers)
try:
log.info(f'详情页标题:{detailmsg["title"]}')
log.info(f'详情请求地址:{durl}')
soup = BeautifulSoup(dhmsg, 'html.parser')
soup = reqbase.paserUrl(str(soup), durl)
suoyinhao=soupPaserHtml(soup,'div[class="referencebox"]>dl:nth-child(1)>dd')[1]
wenjianhao=soupPaserHtml(soup,'div[class="referencebox"]>dl:nth-child(2)>dd')[1]
pubDate=soupPaserHtml(soup,'div[class="referencebox"]>dl:nth-child(4)>dd')[1]
contentWithTag,content=soupPaserHtml(soup,'div[class="trs_editor_view TRS_UEDITOR trs_paper_default trs_web"]')
if not content:
contentWithTag,content=soupPaserHtml(soup,'div[class="content"]')
if not content:
contentWithTag,content=soupPaserHtml(soup,'div[class="view TRS_UEDITOR trs_paper_default trs_external trs_web trs_key4format"]')
if not content:
log.info(f'详情内容为空:{durl}')
#contentWithTag,content=soupPaserHtml(soup,'div[class="view TRS_UEDITOR trs_paper_default trs_external trs_web trs_key4format"]')
detailmsg['contentWithTag']=contentWithTag
detailmsg['content']=content
detailmsg['suoyinhao']=suoyinhao
detailmsg['wenjianhao']=wenjianhao
detailmsg['pubDate']=pubDate
except Exception as e:
print(f'详情解析异常{e}')
return detailmsg
def soupPaserHtml(soup,csstag):
try:
tagmsg=soup.select(csstag)[0]
tagmsgtext=tagmsg.text
except Exception as e:
tagmsg=''
tagmsgtext=''
log.info(f'标签解析异常{e}')
return tagmsg,tagmsgtext
if __name__ == '__main__':
getList()
 import os
+from urllib.parse import unquote
 import redis
 from flask import Flask, request, send_file, render_template, jsonify
@@ -71,12 +72,14 @@ def index():
 def get_news():
     data=request.form
-@app.route('/ws/setCookie', methods=['GET'])
+@app.route('/ws/setCookie', methods=['GET','POST'])
 # @cross_origin()
 def setCookie():
     try:
-        cookie = request.args.get('cookie')
+        # cookie = request.args.get('cookie')
-        r.sadd('wscookie',cookie)
+        wscookie = request.form.get('cookie')
+        wscookie = unquote(wscookie)
+        r.sadd('wscookie',wscookie)
     except Exception as e:
         print('error')
     return 'succes'
......
China Judgements Online (裁判文书网) changes:
1. Incomplete header information previously caused requests to come back without the required list data.
2. The request parameters are generated by JS code based on the current time.
3. The returned data is decrypted using 3DES.
4. Revised collection workflow:
a browser refreshes the page on a timer to harvest cookie information into Redis; each request then pulls one cookie from Redis.
5. The account-ban policy is still unknown and needs testing:
each machine may only make requests with its own local cookies; switching to other accounts will get those accounts banned.
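A minimal sketch of the two mechanisms described above, assuming pycryptodome and hypothetical values for the Redis set name, 3DES key, and IV (the site's real parameters are not shown here):

import base64
import redis
from Crypto.Cipher import DES3  # pycryptodome

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def pick_cookie():
    # pull one cookie at random from the pool the browser keeps refreshed
    cookie = r.srandmember('wenshu_cookie')  # hypothetical set name
    return cookie.decode() if cookie else ''

def des3_decrypt(ciphertext_b64, key, iv):
    # 3DES in CBC mode is assumed; key must be 16 or 24 bytes, iv 8 bytes
    cipher = DES3.new(key, DES3.MODE_CBC, iv)
    plain = cipher.decrypt(base64.b64decode(ciphertext_b64))
    return plain.rstrip(b'\x00').decode('utf-8', errors='ignore')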
Index
An index is a collection of documents of one kind, similar to a table in a relational database. An index is identified by its name, and every index name must be lowercase.
Document
A single record in an index is called a document, equivalent to a row in a relational table.
Field
A field of the JSON structure, equivalent to a column in a relational table.
Mapping
A mapping places rules and constraints on how data is handled, such as a field's data type, default value, analyzer, and whether it is indexed; all of these can be set in the mapping.
Shards
An index can store a volume of data beyond the hardware limits of a single node, comparable to splitting a table.
ES can divide an index into multiple pieces; each piece is called a shard.
The desired number of shards can be specified when the index is created.
Sharding allows horizontal splitting/scaling of content volume, and allows distributed, parallel operations across shards, improving performance/throughput.
Replicas
Replicas provide high availability when a shard/node fails.
It is essential that a replica shard is never allocated on the same node as its original/primary shard.
Replicas scale search volume/throughput, since searches can run in parallel on all replicas.
from elasticsearch import Elasticsearch
# connect to ES
es=Elasticsearch(["192.168.1.90:9200"],
sniff_on_start=True, # check the cluster before first use
sniff_on_connection_fail=True, # refresh the node list when a node stops responding
sniffer_timeout=60) # sniffer timeout in seconds
index_name='test_data'
def main():
# connect to ES
es=Elasticsearch(["192.168.1.90:9200"],
sniff_on_start=True, # check the cluster before first use
sniff_on_connection_fail=True, # refresh the node list when a node stops responding
sniffer_timeout=60) # sniffer timeout in seconds
if __name__ == '__main__':
main()
# create an index
def create_index():
# define the mapping body
body_index = {
'mappings': {
'properties': {
'name': {
'type': 'keyword'
},
'age': {
'type': 'long'
},
'tags': {
'type': 'text'
}
}
},
'settings': {
'index': {
'number_of_shards': '3',
'number_of_replicas': '0'
}
}
}
# create the index
res = es.indices.create(index=index_name, body=body_index, ignore=400)
def instert_data():
person1 = {
'name': '张三',
'age': 18,
'tags': '勤奋学习十载寒窗,凿壁借光,囊萤映雪,手不释卷,有良好的表达能力。有耐心心态好,善于维系客户关系。果断热情勇敢孤僻活力,思想成熟能够独立工作。'
}
res = es.index(index=index_name, body=person1)
from elasticsearch import helpers
def instert_bach():
insert_infos = []
person2 = {
'_index': index_name,
'name': '李四',
'age': 20,
'tags': '有极强的领导艺术,公正严明铁面无私,公私分明。关心他人无微不至,体贴入微。精力充沛,并有很强的事业心。气吞山河正气凛然,善于同各种人员打交道。'
}
person3 = {
'_index': index_name,
'name': '王五',
'age': 19,
'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
}
insert_infos.append(person2)
insert_infos.append(person3)
helpers.bulk(client=es, actions=insert_infos)
def del_index():
# delete the index
res = es.indices.delete(index=index_name, ignore=[400])
def del_doc_byid():
# delete by document id
res = es.delete(index=index_name, id='bKTgXYUBfH4USN9RFMOh')
def del_by_condation():
# delete by query condition
body = {
'query': {
'match': {
'name': '张三'
}
}
}
res = es.delete_by_query(index=index_name, body=body, ignore=[400, 404])
# index() does two things: if the document does not exist it inserts it; if it already exists it updates it.
# When updating via index(), the body must contain every field, otherwise the omitted fields are cleared.
def index_update_doc():
body = {
'name': '王五',
'age': 19,
'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
}
res = es.index(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)
def update_doc():
body = {
'doc': {
'name': '王五'
}
}
es.update(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)
def select_info():
# inspect information about the indexes in ES
index_info = es.indices.get('*')
# list the index names
index_names = index_info.keys()
index_name = 'es_index'
print(es.indices.exists(index_name))
doc_count = es.count(index=index_name)
def query_by_id():
body = {
'query': {
'match': {
'_id': 'baTgXYUBfH4USN9RFMOh'
}
}
}
res = es.search(index=index_name, body=body)
def query_by_filed():
body = {
'query': {
'match': {
'age': 20
}
},
'_source': ['name', 'tags']
}
res = es.search(index=index_name, body=body)
def query_by_sort():
body = {
'sort': {
'age': {
'order': 'desc' # asc: ascending, desc: descending
}
}
}
res = es.search(index=index_name, body=body)
def query_by_range():
body = {
'query': {
'range': {
'age': {
'gt': 18,
'lte': 20
}
}
}
}
res = es.search(index=index_name, body=body)
def query_by_page():
body = {
'sort': {
'age': {
'order': 'desc' # asc: ascending, desc: descending
}
},
'from': 0,
'size': 1
}
res = es.search(index=index_name, body=body)
def quere_by_paser():
body = {
"query": {
"match_phrase": {
"tags": "耐心"
}
}
}
res = es.search(index=index_name, body=body)
def query_by_mult():
body = {
"query": {
"bool": {
"must": [
{
"match": {
"name": "张三"
}
},
{
"match_phrase": {
"tags": "耐心"
}
}
]
}
}
}
res = es.search(index=index_name, body=body)
def query_by_not():
body = {
"query": {
"bool": {
"must": [
{
"match": {
"name": "王五"
}
}
],
'must_not': [
{
"match_phrase": {
"tags": "耐心"
}
}
]
}
}
}
res = es.search(index=index_name, body=body)
@@ -486,8 +486,8 @@ class JrttnewsSpider(object):
     def extractorMsg(self,url,title):
         content=''
         contentWithTag=''
-        lang=''
+        lang='cn'
-        lang=self.detect_language(title)
+        # lang=self.detect_language(title)
         sm=SmartExtractor(lang)
         try:
             # raw_html=self.detailHtml(url)
......
@@ -48,6 +48,7 @@ class SougouSpider(object):
         chrome_options = webdriver.ChromeOptions()
         chrome_options.binary_location = self.config.get('selenium', 'binary_location')
         self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
         # driver = webdriver.Chrome(chrome_options=chrome_options)
         self.qtitle = Queue()
         self.qurl = Queue()
@@ -373,9 +374,9 @@ class SougouSpider(object):
     def extractorMsg(self,url,title):
         content=''
         contentWithTag=''
-        lang=''
+        lang='cn'
         try:
-            lang=self.detect_language(title)
+            # lang=self.detect_language(title)
             raw_html=self.webDriver(url)
             sm=SmartExtractor(lang)
             article=sm.extract_by_html(raw_html)
......
@@ -235,7 +235,7 @@ if __name__ == '__main__':
     # create a thread pool; max_workers sets the size
     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
         # submit tasks to the pool, one task per data item
-        results = [executor.submit(sougouTaskJob.runSpider, data) for data in kwList]
+        results = [executor.submit(sougouTaskJob.runLocSpider, data) for data in kwList]
         # collect results as the tasks complete
         for future in concurrent.futures.as_completed(results):
             try:
......
@@ -7,7 +7,7 @@ import openpyxl
 from urllib.parse import urlparse
 # open the Excel file
-workbook = openpyxl.load_workbook('2500url.xlsx')
+workbook = openpyxl.load_workbook(r'C:\Users\WIN10\Desktop\aa\qiye.xlsx')
 # get the worksheet object
 worksheet = workbook.active
@@ -16,8 +16,9 @@ qiyedatas=[]
 # iterate over the worksheet rows
 for row in worksheet.iter_rows(values_only=True):
     qiyemsg={
-        'url':row[0],
+        '序号':row[0],
-        'exist':row[1],
+        '企业名称':row[0],
+        '网址':row[1],
     }
     qiyedatas.append(qiyemsg)
@@ -31,8 +32,9 @@ sql1 = """select id, info_source_code, web_site_name, site_name , site_uri from
 cont=1;
 qynot=[]
 qyin=[]
+qynn=[]
 for qy in qiyedatas:
-    name=qy['url']
+    name=qy['网址']
     if name is None:
         qy['exist']=0
         qyin.append(qy)
@@ -42,6 +44,10 @@ for qy in qiyedatas:
         qyin.append(qy)
         continue
     try:
+        parsed_url = urlparse(name)
+        domain = parsed_url.netloc
+        if domain.startswith("www."):
+            name = domain[4:]
         sql2=sql1.replace("[name]",name)
         cursor.execute(sql2)
     except Exception as e:
@@ -54,14 +60,29 @@ for qy in qiyedatas:
         qy['exist']=0
         qyin.append(qy)
     else:
+        result_data
+        # qyin.append(qy)
+        for row2 in tqdm(result_data):
+            try:
+                rd = {'id': row2[0],
+                      '编码': row2[1],
+                      '网站名称': row2[2],
+                      '栏目名称': row2[3],
+                      '栏目地址': row2[4],
+                      '企业名称': qy['企业名称']
+                      }
+                qynn.append(rd)
+            except Exception as e:
+                print(e)
+                print("查询失败!!"+sql2)
         cont+=1
         print(cont)
         qy['exist']=1
         qyin.append(qy)
 df_out = pd.DataFrame(data=qyin)
-df_out.to_excel('url企业情况在平台中有数据.xlsx', engine='xlsxwriter', index=False)
+df_out.to_excel('url企业名单.xlsx', engine='xlsxwriter', index=False)
-df_out = pd.DataFrame(data=qynot)
+df_out = pd.DataFrame(data=qynn)
 df_out.to_excel('url企业情况在平台中没有数据.xlsx', engine='xlsxwriter', index=False)
......
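A quick check of the domain normalization added above, using a hypothetical URL:

from urllib.parse import urlparse

name = 'http://www.example.com.cn/about'
domain = urlparse(name).netloc  # 'www.example.com.cn'
if domain.startswith('www.'):
    name = domain[4:]           # 'example.com.cn'
print(name)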
#coding=utf-8
@@ -280,7 +280,7 @@ class BaiduSpider(object):
             hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
             hasnext = hasnext.strip()
             timeFlag=False
-            while hasnext == '下一页 >':
+            while '下一页' in hasnext:
                 try:
                     if self.page_num==5:
                         break
@@ -451,32 +451,6 @@ class BaiduSpider(object):
                         break
                 # time.sleep(5)
-    # def getDetailmsg(self,detailhtml,detailmsg):
-    #     try:
-    #         detailurl=detailmsg['detailUrl']
-    #         article_content=self.paserDetail(detailhtml,detailurl)
-    #         content=article_content['content']
-    #         contentWithTag=article_content['body_html']
-    #     except Exception as e:
-    #         self.logger.info('内容抽取失败')
-    #         content=''
-    #         contentWithTag=''
-    #     currentdate=self.getNowDate()
-    #     kword=self.searchkw
-    #     publishtime=detailmsg['publishTag']
-    #     publishtime=self.paserTime(publishtime)
-    #     publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
-    #     detailmsg={
-    #         'title':detailmsg['title'],
-    #         'source':detailmsg['sourceTag'],
-    #         'detailurl':detailurl,
-    #         'content':content,
-    #         'contentHtml':contentWithTag,
-    #         'publishtime':publishDate,
-    #         'currentdate':currentdate,
-    #         'kword':kword
-    #     }
-    #     return detailmsg
     def getProcessitem(self,bdetail):
         nowDate=self.getNowDate()
......
114.116.108.171 runs 8 dynamic-collection services
114.115.234.116 runs 4 dynamic-collection services and 4 overseas-site collection services
114.115.218.248 runs 4 dynamic-collection services and 3 verification services
114.115.162.99 runs 8 services for special-topic collection
114.115.221.202 hosts the 中科软 city-collection services
HK 159.138.150.155 runs 3 overseas-site verification services
HK 94.74.96.195 runs Yahoo Finance financial-data collection
114.115.153.6 old platform: research center and central-SOE public-opinion collection
114.116.122.247 old platform: central-SOE public opinion, evaluation-center collection, plus new-platform central-SOE public-opinion collection for the evaluation center
49.4.24.191 server environment does not allow copy/paste
Expiring: 114.116.48.72, 4 overseas-collection services to migrate (move to the .116 host; the .116 dynamic-collection services move to .171)
Expiring: 114.115.235.92, test collection services for building-materials and machinery public opinion (move to the 152.6 server)
192.168.1.239 runs 4 static-collection services
192.168.1.240 runs 4 static-collection services