Commit b52e4502 Author: 薛凌堃

2/26

Parent ca40e9aa
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis
def putCom():
    com_list = ['91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
                '91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
                '91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
                '91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
                '91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
                '91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
                '91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
                '9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
                '91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
                '91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
                '911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
                '9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
                '911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
                '9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
                '91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
                '911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
                '9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
                '91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
                '91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
                '9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
                '91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
                '912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
                '9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
                '91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N']
    df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
    # Connect to the Redis database
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    for i in range(len(df)):
        social_code = df['social_code'][i]
        com_name = df['name'][i]
        # print(social_code)
        # Skip companies already covered by com_list and internal placeholder codes
        if social_code in com_list:
            continue
        if 'ZZSN' in social_code or 'ZD' in social_code:
            continue
        item = social_code + '|' + com_name
        r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)
def putCom_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run at 00:00 on the 1st of every month
    scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
    try:
        # putCom()  # run once before the schedule starts
        scheduler.start()
    except Exception as e:
        print('Scheduled collection exception', e)


if __name__ == '__main__':
    putCom_task()
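For context, something downstream presumably pops these `social_code|com_name` items back off the queue; a minimal consumer sketch under that assumption (`consume_one` is a hypothetical helper, not part of this commit):

def consume_one(r):
    # BLPOP blocks until an item arrives; returns a (key, value) pair or None on timeout
    popped = r.blpop('UpdateBasdeInfo:SocialCode_CompanyName', timeout=5)
    if popped is None:
        return None
    social_code, com_name = popped[1].decode('utf-8').split('|', 1)
    return social_code, com_name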
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
with cnx.cursor() as cursor:
    select = """select relationName, relationId from klb_company"""
    cursor.execute(select)
    results = cursor.fetchall()
    for result in results:
        name = result[0]
        xydm = result[1]
        item = f'{name}|{xydm}'
        # Push "name|credit code" items; the original pushed an undefined cell_value
        r.rpush('SousuoBaidu:companyname', item)
# Name of the list to deduplicate
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch every element in the list
elements = r.lrange(list_name, 0, -1)
# For each distinct element that occurs more than once, trim the extras.
# (The original called lrem with count=0 first, which deletes every occurrence.)
seen = set()
for element in elements:
    if element in seen:
        continue
    seen.add(element)
    count = elements.count(element)
    if count > 1:
        # A positive count removes that many matches from the head,
        # so count - 1 removals leave exactly one copy
        r.lrem(list_name, count - 1, element)
# Print the deduplicated list
print(r.lrange(list_name, 0, -1))
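LREM's count argument is easy to get backwards: positive removes from the head, negative from the tail, zero removes all occurrences. A quick sanity check against a throwaway key (the key name below is made up for illustration):

demo = 'tmp:lrem_demo'
r.delete(demo)
r.rpush(demo, 'a', 'b', 'a', 'c', 'a')
r.lrem(demo, 1, 'a')    # removes the first 'a' from the head -> ['b', 'a', 'c', 'a']
r.lrem(demo, -1, 'a')   # removes one 'a' from the tail       -> ['b', 'a', 'c']
r.lrem(demo, 0, 'c')    # removes every 'c'                   -> ['b', 'a']
print(r.lrange(demo, 0, -1))
r.delete(demo)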
# Meetings of the Central Commission for Comprehensively Deepening Reform
import json
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
@@ -26,22 +32,50 @@ headers = {
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
    'Host': 'news.12371.cn',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
    # Meetings of the Central Commission for Comprehensively Deepening Reform
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    # Meetings of the Central Leading Group for Comprehensively Deepening Reform
    # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
    url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
    request = requests.get(url=url, headers=header)
    soup = BeautifulSoup(request.content, 'html.parser')
    request.encoding = request.apparent_encoding
    # print(soup)
    # info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
    info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
    # The first list block on the page belongs to one channel, the second to the other
    flag = 1
    for info_html in info_html_list:
        if flag == 1:
            info_code = 'IN-20230816-0004'
            sid = '1691633319715676162'
        else:
            sid = '1691633869186277378'
            info_code = 'IN-20230816-0005'
        ul_list = info_html.find('ul', class_='ul_list').find_all('li')
        # Walk the entries oldest-first
        for ul in ul_list[::-1]:
            publishDate_ = str(ul.find('span').text)
            date_obj = datetime.strptime(publishDate_, "%Y年%m月%d日")
            publishDate = date_obj.strftime('%Y-%m-%d')
@@ -51,18 +85,27 @@ if __name__ == "__main__":
            newsUrl = ul.find('a')['href']
            summary = ul.find('a').text
            # TODO: deduplicate links
            try:
                # Use a separate name here; the original reused `flag`,
                # clobbering the outer channel counter
                is_collected = r.sismember(info_code, newsUrl)
                if is_collected:
                    log.info('Already collected and stored')
                    continue
            except Exception as e:
                continue
            news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
            news_soup = BeautifulSoup(news_request.content, 'html.parser')
            # print(news_soup)
            try:
                title = news_soup.find('h1', class_='big_title').text
                source = news_soup.find('div', class_='title_bottom').find('i').text
                contentwithTag = news_soup.find('div', class_='word')
                content = contentwithTag.text
            except Exception as e:
                log.error(f'Failed to parse page {newsUrl}')
                continue
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_info = {
                'id': '1681549361661489154' + str(int(time.time() * 1000)),
                'title': title,
@@ -79,6 +122,7 @@ if __name__ == "__main__":
                'createDate': time_now,
            }
            # Record the URL so the next run skips it
            r.sadd(info_code, newsUrl)
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
            try:
                kafka_result = producer.send("research_center_fourth",
@@ -90,3 +134,4 @@ if __name__ == "__main__":
                print('Kafka send exception!')
            finally:
                producer.close()
        flag += 1
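The arguments to producer.send are elided in the hunk above; a typical completion, offered only as an assumption rather than the committed code, serializes the dict to JSON bytes and blocks on the returned future:

# Hypothetical shape of the elided send call, not the committed code:
kafka_result = producer.send("research_center_fourth",
                             json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
record_metadata = kafka_result.get(timeout=10)  # raises if the broker rejects the record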
@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
browser.get(url) browser.get(url)
# 可改动 # 可改动
time.sleep(20) time.sleep(80)
s = requests.session() s = requests.session()
#获取到token和cookies #获取到token和cookies
...
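The code that actually harvests the token and cookies is elided above; the usual Selenium-to-requests handoff, sketched here purely as an assumption about what follows, copies the logged-in browser cookies into the session and reads the token off the post-login URL:

# Hypothetical sketch, assuming `browser` is a logged-in Selenium webdriver:
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'])
token = browser.current_url.split('token=')[-1]  # assumes a token= query parameter after login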
@@ -170,5 +170,71 @@ for data in datas:
    # f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)
# import re, datetime
#
#
# def paserTime(publishtime):
#     """Parse relative Chinese publish times such as '3天前' into datetimes."""
#     timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
#     current_datetime = datetime.datetime.now()
#     publishtime = publishtime.strip()
#     print(publishtime)
#
#     try:
#         if '年前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=365 * day)
#             publishtime = current_datetime - delta
#         elif '月前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             # timedelta has no 'months' argument; approximate one month as 30 days
#             delta = datetime.timedelta(days=30 * day)
#             publishtime = current_datetime - delta
#         elif '周前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(weeks=day)
#             publishtime = current_datetime - delta
#         elif '天前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=day)
#             publishtime = current_datetime - delta
#         elif '前天' in publishtime:
#             delta = datetime.timedelta(days=2)
#             publishtime = current_datetime - delta
#         elif '昨天' in publishtime:
#             delta = datetime.timedelta(days=1)
#             publishtime = current_datetime - delta
#         elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
#             if '小时' in publishtime:
#                 hour = publishtime.split("小时")[0]
#             else:
#                 hour = 0
#             if hour != 0:
#                 min = publishtime.split("小时")[1].split("分钟")[0]
#             else:
#                 min = publishtime.split("分钟")[0]
#
#             delta = datetime.timedelta(hours=int(hour), minutes=int(min))
#             publishtime = current_datetime - delta
#         elif '年' in publishtime and '月' in publishtime:
#             time_format = '%Y年%m月%d日'
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#         elif '月' in publishtime and '日' in publishtime:
#             current_year = current_datetime.year
#             time_format = '%Y年%m月%d日'
#             publishtime = str(current_year) + '年' + publishtime
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#     except Exception as e:
#         print('Time parsing exception!!')
#     return publishtime
#
# if __name__ == "__main__":
#     publishtime_ = '1小时17分钟前'
#     publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
#     print(publish_time)