Commit b52e4502 Author: 薛凌堃

2/26

Parent ca40e9aa
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis
def putCom():
    com_list = ['91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
                '91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
                '91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
                '91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
                '91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
                '91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
                '91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
                '9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
                '91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
                '91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
                '911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
                '9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
                '911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
                '9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
                '91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
                '911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
                '9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
                '91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
                '91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
                '9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
                '91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
                '912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
                '9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
                '91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N']
    df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
    # Connect to the Redis database
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    for i in range(len(df)):
        social_code = df['social_code'][i]
        com_name = df['name'][i]
        # print(social_code)
        if social_code in com_list:
            continue
        if 'ZZSN' in social_code or 'ZD' in social_code:
            continue
        item = social_code + '|' + com_name
        r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)
def putCom_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run once a month
    scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
    try:
        # putCom()  # optionally run once before the schedule starts
        scheduler.start()
    except Exception as e:
        print('Scheduled collection error', e)


if __name__ == '__main__':
    putCom_task()
\ No newline at end of file
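# A minimal consumer sketch (not part of this commit) for the queue pushed above,
# assuming the same Redis instance; the print is a stand-in for the real update
# logic, which lives elsewhere in the pipeline.
import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

def drain_update_queue():
    # Pop 'social_code|company_name' items until the list is empty
    while True:
        raw = r.lpop('UpdateBasdeInfo:SocialCode_CompanyName')
        if raw is None:  # list drained
            break
        social_code, com_name = raw.decode('utf-8').split('|', 1)
        print(social_code, com_name)  # placeholder for the actual update step

if __name__ == '__main__':
    drain_update_queue()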
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
with cnx.cursor() as cursor:
    select = """select relationName, relationId from klb_company"""
    cursor.execute(select)
    results = cursor.fetchall()
    for result in results:
        name = result[0]
        xydm = result[1]
        item = f'{name}|{xydm}'
        r.rpush('SousuoBaidu:companyname', item)
# Name of the list to de-duplicate
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch all elements in the list
elements = r.lrange(list_name, 0, -1)
# For each distinct element, keep exactly one copy
for element in set(elements):
    count = elements.count(element)
    # If it appears more than once, remove the extra occurrences
    # (lrem with a positive count deletes that many copies from the head)
    if count > 1:
        r.lrem(list_name, count - 1, element)
# Print the de-duplicated list
print(r.lrange(list_name, 0, -1))
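# An alternative sketch (not part of this commit): rebuild the list in one
# MULTI/EXEC transaction, which keeps first-seen order and avoids races with
# concurrent writers between the lrange and the lrem calls.
def dedupe_list_atomic(r, list_name):
    deduped = list(dict.fromkeys(r.lrange(list_name, 0, -1)))  # unique, order preserved
    pipe = r.pipeline(transaction=True)
    pipe.delete(list_name)
    if deduped:
        pipe.rpush(list_name, *deduped)
    pipe.execute()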
# 中央全面深化改革委员会会议 (Central Commission for Comprehensively Deepening Reform meetings)
import json
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
@@ -26,22 +32,50 @@ headers = {
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
    'Host': 'news.12371.cn',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
request = requests.get(url=url, headers=header)
soup = BeautifulSoup(request.content, 'html.parser')
# print(soup)
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
# info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
flag = 1
for info_html in info_html_list:
if flag == 1:
info_code = 'IN-20230816-0004'
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
info_code = 'IN-20230816-0005'
ul_list = info_html.find('ul', class_='ul_list').find_all('li')
for ul in ul_list[::-1]:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
@@ -51,18 +85,27 @@ if __name__ == "__main__":
                newsUrl = ul.find('a')['href']
                summary = ul.find('a').text
                # todo: de-duplicate by link
                try:
                    is_collected = r.sismember(info_code, newsUrl)
                    if is_collected:
                        log.info('Article already collected')
                        continue
                except Exception as e:
                    continue
                news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
                news_soup = BeautifulSoup(news_request.content, 'html.parser')
                # print(news_soup)
                try:
                    title = news_soup.find('h1', class_='big_title').text
                    source = news_soup.find('div', class_='title_bottom').find('i').text
                    contentwithTag = news_soup.find('div', class_='word')
                    content = contentwithTag.text
                except Exception as e:
                    log.error(f'Failed to parse page {newsUrl}')
                    continue
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'id': '1681549361661489154' + str(int(time.time() * 1000)),
                    'title': title,
@@ -79,6 +122,7 @@ if __name__ == "__main__":
                    'createDate': time_now,
                }
                r.sadd(info_code, newsUrl)
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                try:
                    kafka_result = producer.send("research_center_fourth",
@@ -90,3 +134,4 @@ if __name__ == "__main__":
                    print('Kafka send failed!')
                finally:
                    producer.close()
            flag += 1
\ No newline at end of file
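# A sketch (not part of this commit): hoist a single KafkaProducer out of the
# per-article loop instead of creating and closing one per message, and let
# value_serializer handle the JSON encoding once.
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=['114.115.159.144:9092'],
    value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))
try:
    for dic_info in []:  # stand-in for the per-article loop above
        producer.send("research_center_fourth", dic_info)
    producer.flush()  # deliver anything still buffered
finally:
    producer.close()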
@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(20)
time.sleep(80)
s = requests.session()
#获取到token和cookies
......
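# A sketch (assumptions: a Selenium WebDriver bound to `browser`, and that
# mp.weixin.qq.com sets a session cookie after the QR scan): poll for login
# instead of a fixed time.sleep, so the wait ends as soon as the scan completes.
import time

def wait_for_login(browser, cookie_name='slave_sid', timeout=120):
    # cookie_name is an assumption; adjust to whatever cookie marks a live session
    deadline = time.time() + timeout
    while time.time() < deadline:
        if any(c['name'] == cookie_name for c in browser.get_cookies()):
            return True
        time.sleep(2)
    return False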
@@ -170,5 +170,71 @@ for data in datas:
    # f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate', data=dic_info_, headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)
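# A sketch (not part of this commit): wrap the translate call with retries and
# a timeout, in case the endpoint is transiently unavailable. URL and payload
# shape are reused from above.
import time
import requests

def translate_with_retry(payload, headers, retries=3):
    for attempt in range(retries):
        try:
            resp = requests.post('http://117.78.23.14:5000/translate',
                                 data=payload, headers=headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            time.sleep(2 ** attempt)  # backoff: 1s, 2s, 4s
    return None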
# import re, datetime
#
#
# def paserTime(publishtime):
#     timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
#     current_datetime = datetime.datetime.now()
#     publishtime = publishtime.strip()
#     print(publishtime)
#
#     try:
#         if '年前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=365 * day)
#             publishtime = current_datetime - delta
#         elif '月前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=30 * day)  # timedelta has no 'months' argument
#             publishtime = current_datetime - delta
#         elif '周前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(weeks=day)
#             publishtime = current_datetime - delta
#         elif '天前' in publishtime:
#             numbers = re.findall(r'\d+', publishtime)
#             day = int(numbers[0])
#             delta = datetime.timedelta(days=day)
#             publishtime = current_datetime - delta
#         elif '前天' in publishtime:
#             delta = datetime.timedelta(days=2)
#             publishtime = current_datetime - delta
#         elif '昨天' in publishtime:
#             current_datetime = datetime.datetime.now()
#             delta = datetime.timedelta(days=1)
#             publishtime = current_datetime - delta
#         elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
#             if '小时' in publishtime:
#                 hour = publishtime.split("小时")[0]
#             else:
#                 hour = 0
#             if hour != 0:
#                 min = publishtime.split("小时")[1].split("分钟")[0]
#             else:
#                 min = publishtime.split("分钟")[0]
#
#             delta = datetime.timedelta(hours=int(hour), minutes=int(min))
#             publishtime = current_datetime - delta
#         elif '年' in publishtime and '月' in publishtime:
#             time_format = '%Y年%m月%d日'
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#         elif '月' in publishtime and '日' in publishtime:
#             current_year = current_datetime.year
#             time_format = '%Y年%m月%d日'
#             publishtime = str(current_year) + '年' + publishtime
#             publishtime = datetime.datetime.strptime(publishtime, time_format)
#     except Exception as e:
#         print('Time parsing error!!')
#     return publishtime
#
#
# if __name__ == "__main__":
#     publishtime_ = '1小时17分钟前'
#     publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
#     print(publish_time)
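# A runnable sketch of the commented-out parser above for the relative cases,
# using dateutil.relativedelta so that "N月前" maps to real calendar months
# (assumption: the python-dateutil package is installed).
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

def parse_relative_time(publishtime: str) -> datetime:
    now = datetime.now()
    publishtime = publishtime.strip()
    numbers = re.findall(r'\d+', publishtime)
    n = int(numbers[0]) if numbers else 0
    if '年前' in publishtime:
        return now - relativedelta(years=n)
    if '月前' in publishtime:
        return now - relativedelta(months=n)
    if '周前' in publishtime:
        return now - relativedelta(weeks=n)
    if '天前' in publishtime:
        return now - relativedelta(days=n)
    return now

print(parse_relative_time('3个月前').strftime('%Y-%m-%d'))  # three calendar months ago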