Commit 862e97ab  Author: 薛凌堃

1/31

Parent 1d1053c8
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
             break
         except Exception as e:
             time.sleep(3)
+            log.info(e)
             continue
     if page_size < 1:
@@ -206,7 +207,8 @@ def download(data, order_by,header):
         come = data['come']
     except:
         come = ''
+    if publishDate < '2024-01-29':
+        return
     tf_url = add_check_url(sourceAddress)
     if tf_url:
         dic_result = {
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
     #     qianyanzhishiku()
     # except Exception as e:
     #     pass
-    try:
-        log.info('shijiejingjiluntan')
-        shijiejingjiluntan()
-    except Exception as e:
-        log.info(e)
-        pass
+    # try:
+    #     log.info('shijiejingjiluntan')
+    #     shijiejingjiluntan()
+    # except Exception as e:
+    #     log.info(e)
+    #     pass
     # try:
     #     log.info('dongfangcaifu')
     #     dongfangcaifu()
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
     # except Exception as e:
     #     log.info(e)
     #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu4')
-    #     dongfangcaifu4()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu5')
-    #     dongfangcaifu5()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu6')
-    #     dongfangcaifu6()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu7')
-    #     dongfangcaifu7()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+
+    try:
+        log.info('dongfangcaifu4')
+        dongfangcaifu4()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu5')
+        dongfangcaifu5()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu6')
+        dongfangcaifu6()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu7')
+        dongfangcaifu7()
+    except Exception as e:
+        log.info(e)
+        pass
import json
import sys

import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
def two_dfsm_mtgc():
    """
    地方扫描 (crawl the "local scan" column of the SASAC site)
    """
    info_list = []
    url_list = ['http://www.sasac.gov.cn/n2588025/n2588129/index.html',
                # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
                ]
    for url in url_list:
        # read maxPageNum out of the paging <td> so the archive page URLs can be built
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378, int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' not in href:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        # skip links already recorded in Redis
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source, pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo",
                                                         json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
\ No newline at end of file
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


# 国资要闻 (key state-owned-assets news column)
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td', id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages)+1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            # http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' not in href:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source, pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content, 6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo",
                                                     json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
\ No newline at end of file
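Reviewer note: two_dfsm_mtgc() and gzyw() repeat the same Redis-dedup-then-Kafka-send block verbatim. A minimal sketch of a shared helper is below; the name send_if_new and its placement are assumptions added for illustration, not part of this commit, while the broker address, topic and "-test" key suffix are the ones already used above.

# Sketch only: a hypothetical helper both crawlers could call instead of
# repeating the dedup + Kafka block. The Redis client `r`, the logger and the
# broker address are the ones already used in these scripts.
import json
from kafka import KafkaProducer


def send_if_new(r, log, info_code, href, result_dict):
    """Skip URLs already recorded in Redis; otherwise push the record to Kafka."""
    if r.sismember(info_code + '-test', href):
        log.info('信息已采集入库过')  # already collected
        return False
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        producer.send('crawlerInfo',
                      json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
        r.sadd(info_code + '-test', href)
        log.info('发送kafka成功!')
        return True
    except Exception as e:
        log.info(e)
        return False
    finally:
        producer.close()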
"""
中证智能财讯
"""
import json
import requests
from bs4 import BeautifulSoup
def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
\ No newline at end of file
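Reviewer note: zzcx() parses title, origin and publishDate from each detail page but does not yet deduplicate or publish anything. If it is meant to feed the same pipeline as the SASAC crawlers, the missing tail of the loop might look roughly like the sketch below; the sid value is a placeholder rather than a real channel id from this commit, and content/contentWithTag extraction is still missing.

# Hypothetical continuation of the `for news in records:` loop in zzcx(),
# assembling the parsed fields into the result_dict shape used by the other
# crawlers in this commit. 'SID_PLACEHOLDER' is a placeholder value.
result_dict = {
    'id': '',
    'sid': 'SID_PLACEHOLDER',
    'title': title,
    'origin': origin,
    'source': 16,
    'publishDate': publishDate,
    'sourceAddress': news_url,
}
# A Redis dedup check and a KafkaProducer send, as in gzyw(), would follow here.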
@@ -85,7 +85,8 @@ class ClassTool():
             '来源': dic_news['labels'][0]['relationName'],
             '创建时间': dic_news['createDate'],
             '带标签内容': dic_news['contentWithTag'][:100],
-            '发布时间': dic_news['publishDate']
+            '发布时间': dic_news['publishDate'],
+            '标题': dic_news['title']
         }
         self.db_storage.insert_one(aaa_dic)
@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
 #
 #     code = use_ocr(out_img_path)
 #     验证码输入框元素.send_keys(code)
-import requests
-headers = {
-    # 'Accept': '*/*',
-    # 'Accept-Encoding': 'gzip, deflate, br',
-    # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
-    # 'Cache-Control': 'no-cache',
-    # 'Connection': 'keep-alive',
-    # 'Host': 'search-api-web.eastmoney.com',
-    # 'Pragma': 'no-cache',
-    # 'Sec-Fetch-Dest': 'script',
-    # 'Sec-Fetch-Mode': 'no-cors',
-    # 'Sec-Fetch-Site': 'same-site',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
-    # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
-    # 'sec-ch-ua-mobile': '?0',
-    # 'sec-ch-ua-platform': '"Windows"'
-}
-url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
-
-
-# res = requests.get(url).text[1:-1]
-res = requests.get(url=url, headers=headers)
-with open('./a.pdf','wb') as f:
-    f.write(res.content)
\ No newline at end of file
+# import requests
+# headers = {
+#     # 'Accept': '*/*',
+#     # 'Accept-Encoding': 'gzip, deflate, br',
+#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+#     # 'Cache-Control': 'no-cache',
+#     # 'Connection': 'keep-alive',
+#     # 'Host': 'search-api-web.eastmoney.com',
+#     # 'Pragma': 'no-cache',
+#     # 'Sec-Fetch-Dest': 'script',
+#     # 'Sec-Fetch-Mode': 'no-cors',
+#     # 'Sec-Fetch-Site': 'same-site',
+#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
+#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
+#     # 'sec-ch-ua-mobile': '?0',
+#     # 'sec-ch-ua-platform': '"Windows"'
+# }
+# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
+#
+#
+# # res = requests.get(url).text[1:-1]
+# res = requests.get(url=url, headers=headers)
+# with open('./a.pdf','wb') as f:
+#     f.write(res.content)
+import datetime
+import json
+import requests
+import pymongo
+from base import BaseCore
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
+    '数据源_0504']
+
+datas = db_storage.find({'postCode':'2'}).limit(5)
+for data in datas:
+    title = data['titleForeign']
+    contentWithTag = data['richTextForeign']
+    summary = data['contentForeign']
+    dic_info = {
+        'title':title,
+        'summary':summary,
+        'contentWithTag':contentWithTag
+    }
+    headers = {
+        'Content-Type': 'application/json',
+    }
+    dic_info_ = json.dumps(dic_info)
+    # print(dic_info_)
+    # with open('./data.json','w') as f:
+    #     f.write(dic_info_)
+    # break
+    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
+    req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
+    log.info(req.text)
\ No newline at end of file