Commit 497088ef  Author: 刘伟刚

Code update 11

Parent 3154b028
# -*- coding:utf-8 -*-
import datetime
import time
@@ -26,11 +26,11 @@ def page_list():
header = {
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
- 'Content-Length':'72',
+ 'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
- 'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
+ 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
@@ -40,9 +40,9 @@ def page_list():
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
- 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+ 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
- 'Accept-Language':'zh-CN,zh'
+ 'Accept-Language':'zh-CN,zh;q=0.9'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,445):
@@ -79,23 +79,23 @@ def detailpaser(dmsg):
hh={
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
- 'Content-Length':'14',
+ 'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
- 'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
+ 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
- 'x-yss-page':'publicService/pages/policyQALibrary/detail/detail',
+ 'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
- 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+ 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
- 'Accept-Language':'zh-CN,zh'
+ 'Accept-Language':'zh-CN,zh;q=0.9'
}
try:
durl=dmsg['url']
......
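For orientation: both hunks above only refresh session-bound header fields (the x-tif-sid, the XWEB build in the User-Agent, and the package version in the Referer). A minimal sketch of how such a header dict drives the paging request follows; the payload shape and response envelope are assumptions, since the request body itself sits in the collapsed part of the diff.

# Hypothetical sketch of the page_list request loop; payload and response
# field names are assumed, not taken from this commit.
import requests

header = {
    'Host': 'xcx.www.gov.cn',
    'Content-Type': 'application/json',
    'xweb_xhr': '1',
    # ... plus the x-tif-*, User-Agent and Referer fields from the hunk above
}
url = 'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1, 445):
    resp = requests.post(url, json={'page': i}, headers=header, timeout=20)
    resp.raise_for_status()
    rows = resp.json().get('data', [])  # assumed response envelope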
@@ -74,19 +74,7 @@ class HgDownFile(object):
return cookie
# request the file download
def reqDownFile(self,data):
- header={
- 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
- 'Accept-Encoding':'gzip, deflate',
- 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
- 'Cache-Control':'max-age=0',
- 'Content-Type':'application/x-www-form-urlencoded',
- 'Host':'stats.customs.gov.cn',
- 'Origin':'http://stats.customs.gov.cn',
- 'Proxy-Connection':'keep-alive',
- 'Upgrade-Insecure-Requests':'1',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
- 'Cookie': self.getcookie()
- }
data=data
proxy={}
# response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
@@ -95,19 +83,36 @@ class HgDownFile(object):
while statuscode != 200:
# time.sleep(5)
try:
+ # header={
+ # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ # 'Accept-Encoding':'gzip, deflate',
+ # 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+ # 'Cache-Control':'max-age=0',
+ # 'Content-Type':'application/x-www-form-urlencoded',
+ # 'Host':'stats.customs.gov.cn',
+ # 'Origin':'http://stats.customs.gov.cn',
+ # 'Proxy-Connection':'keep-alive',
+ # 'Upgrade-Insecure-Requests':'1',
+ # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
+ # 'Cookie': self.getcookie()
+ # }
+ header={
+ 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ 'Accept-Encoding':'gzip, deflate',
+ 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+ 'Cache-Control':'max-age=0',
+ 'Content-Length':'306',
+ 'Content-Type':'application/x-www-form-urlencoded',
+ 'Host':'stats.customs.gov.cn',
+ 'Origin':'http://stats.customs.gov.cn',
+ 'Proxy-Connection':'keep-alive',
+ #'Referer':'http://stats.customs.gov.cn/queryData/queryDataList?pageNum=1&codeLength=8&currentStartTime=202203&currentEndTime=202309&currentDateBySource=202309&selectTableState=3&orderType=CODE%20ASC%20DEFAULT&iEType=0&currencyType=usd&year=2022&startMonth=1&endMonth=11&monthFlag=&unitFlag=false&unitFlag1=false&outerField1=&outerField2=CODE_TS&outerField3=&outerField4=&outerValue1=&outerValue2=&outerValue3=&outerValue4=',
+ 'Upgrade-Insecure-Requests':'1',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
+ 'Cookie': self.getcookie()
+ }
+ data_str = '&'.join([f"{key}={value}" for key, value in data.items()])
response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
# response.encoding = response.apparent_encoding
response.encoding = 'GB2312'
@@ -218,6 +223,7 @@ class HgDownFile(object):
# 2022 months 1-1  202202  2   data up to Jan 2022 uses 2
# 2022 months 1-2  202202  3   2022 cumulative data uses 3
# 2022 months 2-2  202202  1   data after Jan 2022 uses 1
+ selectTableState=2
if year<2022:
selectTableState= 2 # data before 202202 uses 2, after uses 1
else:
......@@ -225,9 +231,11 @@ class HgDownFile(object):
e=int(endMonth)
if year==2022 and s<e: # 2022 cumulative data needs its own parameters
selectTableState= 3
+ if e==2:
+ selectTableState= 2
elif year==2022 and e==1:
selectTableState= 2
- else:
+ elif year==2022 and s==e:
selectTableState= 1 # data before 202202 uses 2, after uses 1
param={
'pageSize': 10,
@@ -237,9 +245,9 @@ class HgDownFile(object):
'startMonth': startMonth,
'endMonth': endMonth,
'monthFlag':'',
- 'unitFlag': False,
- 'unitFlag1': False,
- 'codeLength': '8',
+ 'unitFlag': True,
+ 'unitFlag1': True,
+ 'codeLength': 8,
'outerField1': outerField1,
'outerField2':'',
'outerField3':'',
@@ -250,24 +258,26 @@ class HgDownFile(object):
'outerValue4':'',
'orderType': 'CODE ASC DEFAULT',
'selectTableState': selectTableState, # data before 202201 uses 2, after uses 1
- 'currentStartTime': '202202',
+ 'currentStartTime': 202203,
}
return param
# parameter setup for joint-query fields
def setcodesAndProductparam(self,iEType,currencyType,year,startMonth,endMonth,outerField1,filedCode):
selectTableState= 1 # default is 1
if year<2022:
- selectTableState= 2 # data before 202202 uses 2, after uses 1
+ selectTableState= 2 # data before 202203 uses 2
else:
s=int(startMonth)
e=int(endMonth)
- if year==2022 and s<e: # 2022 cumulative data needs its own parameters
+ if year==2022 and s<e: # 2022 cumulative data uses 3
selectTableState= 3
+ if e==2:
+ selectTableState= 2
elif year==2022 and e==1:
- selectTableState= 2
- else:
- selectTableState= 1 # data before 202202 uses 2, after uses 1
+ selectTableState= 2 # under the 202203 tables, January as a single month uses 2
+ elif year==2022 and s==e:
+ selectTableState= 1 # under the 202203 tables, single months other than January use 1
param={
'pageSize': 10,
'iEType': iEType,
@@ -276,8 +286,8 @@ class HgDownFile(object):
'startMonth': startMonth,
'endMonth': endMonth,
'monthFlag':'',
- 'unitFlag': False,
- 'unitFlag1': False,
+ 'unitFlag': True,
+ 'unitFlag1': True,
'codeLength': '8',
'outerField1': outerField1,
'outerField2':'CODE_TS',
@@ -289,7 +299,7 @@ class HgDownFile(object):
'outerValue4':'',
'orderType': 'CODE ASC DEFAULT',
'selectTableState': selectTableState,
- 'currentStartTime': '202202',
+ 'currentStartTime': 202203,
}
return param
......
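Taken together, the selectTableState changes in both builders encode the dating rules from the comments: the customs tables were re-cut at 202203, 2022 cumulative ranges get their own state, and January 2022 is special-cased. Below is a hedged restatement of the post-diff branch logic in setparam as a standalone helper; this function does not exist in the repo, and note that setcodesAndProductparam defaults to 1 rather than 2, so the two builders still disagree for years after 2022.

# Illustrative mirror of the selectTableState branches in setparam after this commit.
def select_table_state(year: int, start_month: int, end_month: int) -> int:
    state = 2                                   # default introduced by this diff
    if year < 2022:
        state = 2                               # pre-202203 data lives in the old table
    elif year == 2022 and start_month < end_month:
        state = 3                               # 2022 cumulative ranges use 3 ...
        if end_month == 2:
            state = 2                           # ... except 1-2, which still counts as old data
    elif year == 2022 and end_month == 1:
        state = 2                               # January 2022 as a single month
    elif year == 2022 and start_month == end_month:
        state = 1                               # other single months of 2022
    return state                                # later years fall through to the default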
@@ -6,7 +6,7 @@ import json
import pymysql
from pyquery import PyQuery as pq
from flask_cors import cross_origin
+ from urllib.parse import unquote
'''
@@ -71,12 +71,15 @@ def index():
def get_news():
data=request.form
- @app.route('/task/setCookie', methods=['GET'])
- # @cross_origin()
+ @app.route('/task/setCookie', methods=['GET','POST'])
+ @cross_origin()
def setCookie():
try:
- cookie = request.args.get('cookie')
- r.sadd('hgcookie',cookie)
+ # cookie = request.args.get('cookie')
+ hgcookie = request.form.get('cookie')
+ hgcookie = unquote(hgcookie)
+ r.sadd('hgcookie',hgcookie)
+ print(f'setCookie添加cookie成功到redis{hgcookie}')
except Exception as e:
print('error')
return 'succes'
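Because the endpoint now reads the cookie from POST form data and unquote()s it, a matching client call looks roughly like this; host, port, and the sample cookie value are assumptions.

# Hypothetical client for the revised /task/setCookie endpoint.
from urllib.parse import quote
import requests

cookie = 'ASP.NET_SessionId=abc123'            # placeholder, not a real cookie
resp = requests.post(
    'http://127.0.0.1:5000/task/setCookie',    # assumed Flask host/port
    data={'cookie': quote(cookie)},            # the server runs unquote() on this field
    timeout=10,
)
print(resp.text)                               # returns 'succes' on success and failure alike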
@@ -100,7 +103,7 @@ def getCookieSize():
return jsonify(data)
@app.route('/task/getHtml', methods=['POST'])
- # @cross_origin()
+ @cross_origin()
def getnewMonth():
try:
html = request.form.get('html')
......
import os
+ from urllib.parse import unquote
import redis
from flask import Flask, request, send_file, render_template, jsonify
@@ -71,12 +72,14 @@ def index():
def get_news():
data=request.form
- @app.route('/ws/setCookie', methods=['GET'])
+ @app.route('/ws/setCookie', methods=['GET','POST'])
# @cross_origin()
def setCookie():
try:
- cookie = request.args.get('cookie')
- r.sadd('wscookie',cookie)
+ # cookie = request.args.get('cookie')
+ wscookie = request.form.get('cookie')
+ wscookie = unquote(wscookie)
+ r.sadd('wscookie',wscookie)
except Exception as e:
print('error')
return 'succes'
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Changes to the 裁判文书网 (China Judgements Online) spider:
1. Previously, incomplete header information meant list requests returned none of the needed data.
2. Request parameters are generated from the site's JS code, derived from the current time.
3. Returned data is decrypted with 3DES (Triple DES).
4. Collection flow reworked:
a browser refreshes the page on a timer to harvest cookie information into Redis; each request pulls one cookie from Redis.
5. The account-ban policy is unknown and still needs testing:
a machine may only make requests with its own local cookies; switching to another account's cookies can get that account banned.
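A minimal sketch of points 3-5 follows, assuming pycryptodome for the 3DES step and the wscookie Redis set that the /ws/setCookie endpoint above populates; the 24-byte key, the IV, and the payload field name are placeholders, since the real key material is derived from the site's JS.

# Sketch of the cookie-pull + 3DES decryption flow; key, IV and field names are assumed.
import base64

import redis
from Crypto.Cipher import DES3            # pycryptodome
from Crypto.Util.Padding import unpad

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def pick_cookie() -> str:
    # Draw one cookie at random from the set filled by the browser-refresh harvester.
    raw = r.srandmember('wscookie')
    return raw.decode('utf-8') if raw else ''

def des3_decrypt(payload_b64: str, key24: bytes, iv8: bytes) -> str:
    # Decrypt a base64-encoded 3DES/CBC payload; the real key and IV come from the site's JS.
    cipher = DES3.new(key24, DES3.MODE_CBC, iv8)
    plain = unpad(cipher.decrypt(base64.b64decode(payload_b64)), DES3.block_size)
    return plain.decode('utf-8')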
@@ -486,8 +486,8 @@ class JrttnewsSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
- lang=''
- lang=self.detect_language(title)
+ lang='cn'
+ # lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
# raw_html=self.detailHtml(url)
......
@@ -48,6 +48,7 @@ class SougouSpider(object):
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
@@ -373,9 +374,9 @@ class SougouSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
- lang=''
+ lang='cn'
try:
- lang=self.detect_language(title)
+ # lang=self.detect_language(title)
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
......
@@ -235,7 +235,7 @@ if __name__ == '__main__':
# create a thread pool (max_workers below caps it at 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# submit one task per data item to the pool
- results = [executor.submit(sougouTaskJob.runSpider, data) for data in kwList]
+ results = [executor.submit(sougouTaskJob.runLocSpider, data) for data in kwList]
# collect results as tasks complete
for future in concurrent.futures.as_completed(results):
try:
......
@@ -7,7 +7,7 @@ import openpyxl
from urllib.parse import urlparse
# open the Excel workbook
- workbook = openpyxl.load_workbook('2500url.xlsx')
+ workbook = openpyxl.load_workbook(r'C:\Users\WIN10\Desktop\aa\qiye.xlsx')
# get the active worksheet
worksheet = workbook.active
@@ -16,8 +16,9 @@ qiyedatas=[]
# iterate over the worksheet rows
for row in worksheet.iter_rows(values_only=True):
qiyemsg={
- 'url':row[0],
- 'exist':row[1],
+ '序号':row[0],
+ '企业名称':row[0],
+ '网址':row[1],
}
qiyedatas.append(qiyemsg)
@@ -31,8 +32,9 @@ sql1 = """select id, info_source_code, web_site_name, site_name , site_uri from
cont=1;
qynot=[]
qyin=[]
+ qynn=[]
for qy in qiyedatas:
- name=qy['url']
+ name=qy['网址']
if name is None:
qy['exist']=0
qyin.append(qy)
@@ -42,6 +44,10 @@ for qy in qiyedatas:
qyin.append(qy)
continue
try:
+ parsed_url = urlparse(name)
+ domain = parsed_url.netloc
+ if domain.startswith("www."):
+ name = domain[4:]
sql2=sql1.replace("[name]",name)
cursor.execute(sql2)
except Exception as e:
@@ -54,14 +60,29 @@ for qy in qiyedatas:
qy['exist']=0
qyin.append(qy)
else:
result_data
+ # qyin.append(qy)
+ for row2 in tqdm(result_data):
+ try:
+ rd = {'id': row2[0],
+ '编码': row2[1],
+ '网站名称': row2[2],
+ '栏目名称': row2[3],
+ '栏目地址': row2[4],
+ '企业名称': qy['企业名称']
+ }
+ qynn.append(rd)
+ except Exception as e:
+ print(e)
+ print("查询失败!!"+sql2)
cont+=1
print(cont)
qy['exist']=1
qyin.append(qy)
df_out = pd.DataFrame(data=qyin)
- df_out.to_excel('url企业情况在平台中有数据.xlsx', engine='xlsxwriter', index=False)
+ df_out.to_excel('url企业名单.xlsx', engine='xlsxwriter', index=False)
- df_out = pd.DataFrame(data=qynot)
+ df_out = pd.DataFrame(data=qynn)
df_out.to_excel('url企业情况在平台中没有数据.xlsx', engine='xlsxwriter', index=False)
......
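One caveat on the urlparse normalization added above: netloc is only populated when the cell value includes a scheme, so bare domains skip the www-stripping entirely. A standard-library illustration:

# urlparse puts a bare domain in .path, not .netloc, so the added
# www-stripping only fires for values that start with http(s)://.
from urllib.parse import urlparse

print(urlparse('http://www.example.com/x').netloc)  # 'www.example.com'
print(urlparse('www.example.com/x').netloc)         # '' -- bare domain, netloc empty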
#coding=utf-8
@@ -280,7 +280,7 @@ class BaiduSpider(object):
hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
hasnext = hasnext.strip()
timeFlag=False
- while hasnext == '下一页 >':
+ while '下一页' in hasnext:
try:
if self.page_num==5:
break
@@ -451,32 +451,6 @@ class BaiduSpider(object):
break
# time.sleep(5)
- # def getDetailmsg(self,detailhtml,detailmsg):
- # try:
- # detailurl=detailmsg['detailUrl']
- # article_content=self.paserDetail(detailhtml,detailurl)
- # content=article_content['content']
- # contentWithTag=article_content['body_html']
- # except Exception as e:
- # self.logger.info('内容抽取失败')
- # content=''
- # contentWithTag=''
- # currentdate=self.getNowDate()
- # kword=self.searchkw
- # publishtime=detailmsg['publishTag']
- # publishtime=self.paserTime(publishtime)
- # publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
- # detailmsg={
- # 'title':detailmsg['title'],
- # 'source':detailmsg['sourceTag'],
- # 'detailurl':detailurl,
- # 'content':content,
- # 'contentHtml':contentWithTag,
- # 'publishtime':publishDate,
- # 'currentdate':currentdate,
- # 'kword':kword
- # }
- # return detailmsg
def getProcessitem(self,bdetail):
nowDate=self.getNowDate()
......
114.116.108.171: runs 8 dynamic-collection services
114.115.234.116: runs 4 dynamic collectors and 4 overseas-site collectors
114.115.218.248: runs 4 dynamic collectors and 3 verification services
114.115.162.99: runs 8 services for special-topic collection
114.115.221.202: services for the 中科软 city-collection work
HK 159.138.150.155: runs 3 overseas-site verification services
HK 94.74.96.195: runs Yahoo Finance financial-data collection
114.115.153.6: old platform, research center and central-SOE public-opinion collection
114.116.122.247: old-platform central-SOE opinion, evaluation-center collection, and evaluation-center central-SOE opinion collection on the new platform
49.4.24.191: server environment does not allow copy/paste
Expiring: 114.116.48.72, 4 overseas-collection services to migrate (move to .116; .116's dynamic collectors move to .171)
Expiring: 114.115.235.92, test collectors for building-materials and machinery opinion (move to the 152.6 server)
192.168.1.239: runs 4 static-collection services
192.168.1.240: runs 4 static-collection services