Commit 62fa9e22 Author: XveLingKun

20241104

Parent e943c06c
This is a sample text.
\ No newline at end of file
This is another sample text.
\ No newline at end of file
@@ -1387,7 +1387,17 @@ def getkeywords(keywords):
kwList=k3
return kwList
def test31():
aaa = []
def test():
return 1, 2
c = test()
aaa.append((3, c))
print(aaa)
for i, j in aaa:
print(i)
print(j)
pass
if __name__ == "__main__":
# # import queue
@@ -1512,33 +1522,34 @@ if __name__ == "__main__":
# print(aaa)
# aaa = int("07")
# print(aaa)
title = "党建论文│工控科产党委“1+2+V”大党建工作格局推动党建工作与生产经营深度融合"
content = "党建工作和深度融合"
keywords = "(浙江|北京)+(尼日利亚|科特迪瓦)+(活动|访问)"
keywords_split = getkeywords(keywords)
print(keywords_split)
tf_title = 0 # 统计当前规则中的关键词在标题中出现的次数
tf_content = 0 # 统计当前规则中的关键词在内容中出现的次数
for kw in keywords_split:
if "+" in kw:
# todo:2024-10-15 关键词需要同时出现 若没有同时出现则分数为0
kws = kw.split("+")
for k in kws:
c_t = str(title).lower().count(k)
c_c = str(content).lower().count(k)
if c_c:
# 如果文章中出现
tf_content += c_c
else:
tf_content = 0
break
if c_t:
tf_title += c_t
else:
tf_title = 0
break
else:
tf_title += str(title).lower().count(kw)
tf_content += str(content).lower().count(kw)
print(tf_title)
print(tf_content)
\ No newline at end of file
# title = "党建论文│工控科产党委“1+2+V”大党建工作格局推动党建工作与生产经营深度融合"
# content = "党建工作和深度融合"
# keywords = "(浙江|北京)+(尼日利亚|科特迪瓦)+(活动|访问)"
# keywords_split = getkeywords(keywords)
# print(keywords_split)
# tf_title = 0 # 统计当前规则中的关键词在标题中出现的次数
# tf_content = 0 # 统计当前规则中的关键词在内容中出现的次数
# for kw in keywords_split:
# if "+" in kw:
# # todo:2024-10-15 关键词需要同时出现 若没有同时出现则分数为0
# kws = kw.split("+")
# for k in kws:
# c_t = str(title).lower().count(k)
# c_c = str(content).lower().count(k)
# if c_c:
# # 如果文章中出现
# tf_content += c_c
# else:
# tf_content = 0
# break
# if c_t:
# tf_title += c_t
# else:
# tf_title = 0
# break
# else:
# tf_title += str(title).lower().count(kw)
# tf_content += str(content).lower().count(kw)
# print(tf_title)
# print(tf_content)
test31()
\ No newline at end of file
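
The '+' rule exercised above (every part of an 'a+b' keyword must appear, otherwise the score resets to 0) could be factored into a standalone helper. A minimal sketch, assuming getkeywords expands the rule string into terms such as '浙江+尼日利亚+活动' (the helper name is hypothetical):

def cooccurrence_tf(text, kw):
    # term frequency for one rule: for 'a+b' rules every part must
    # appear at least once, otherwise the whole rule scores 0
    text = str(text).lower()
    if "+" not in kw:
        return text.count(kw)
    total = 0
    for part in kw.split("+"):
        c = text.count(part)
        if c == 0:
            return 0  # one missing part zeroes the whole rule
        total += c
    return total

# hypothetical usage mirroring the test above:
# tf_title = sum(cooccurrence_tf(title, kw) for kw in keywords_split)
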
import requests, json, time, pymysql, sys
@@ -3,7 +3,7 @@ import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
list_com_code = []
@@ -38,12 +38,12 @@ list_all_info = []
for com_code in list_com_code[0:]:
print(com_code)
url_xueqiu = f'https://stock.xueqiu.com/v5/stock/f10/cn/company.json?symbol={com_code}'
json_xueqiu = requests.get(url_xueqiu, headers=headers).json()
dic_com = json_xueqiu['data']['company']
com_type = dic_com['classi_name']
# url_xueqiu = f'https://stock.xueqiu.com/v5/stock/f10/cn/company.json?symbol={com_code}'
#
# json_xueqiu = requests.get(url_xueqiu, headers=headers).json()
# dic_com = json_xueqiu['data']['company']
#
# com_type = dic_com['classi_name']
# com_money = dic_com['issue_price']*dic_com['actual_issue_vol']
url_dongfang = f'https://emweb.eastmoney.com/PC_HSF10/CompanySurvey/PageAjax?code={com_code}'
@@ -123,8 +123,8 @@ for com_code in list_com_code[0:]:
list_all_info_tuple = []
for list_info in list_all_info:
list_all_info_tuple.append(tuple(list_info))
with cnx.cursor() as cursor:
Upsql = ''' update sys_base_enterprise_ipo set enterprise_type = %s,total_market_value = %s,before_total_market_value = %s,operating_revenue = %s,operating_revenue_rate = %s,profit = %s,profit_rate = %s,assets = %s,return_on_assets = %s,shareholders_equity = %s where securities_code = %s '''
cursor.executemany(Upsql, list_all_info_tuple)
cnx.commit()
#
# with cnx.cursor() as cursor:
# Upsql = ''' update sys_base_enterprise_ipo set enterprise_type = %s,total_market_value = %s,before_total_market_value = %s,operating_revenue = %s,operating_revenue_rate = %s,profit = %s,profit_rate = %s,assets = %s,return_on_assets = %s,shareholders_equity = %s where securities_code = %s '''
# cursor.executemany(Upsql, list_all_info_tuple)
# cnx.commit()
@@ -167,7 +167,7 @@ if __name__ == '__main__':
print('---------------')
while True:
try:
codeids = ['KW-20220113-0004']
codeids = ['KW-20241021-0003']
for codeid in codeids:
try:
# keymsg=baiduTaskJob.getkafka()
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Awesome-pyecharts</title>
<script type="text/javascript" src="https://assets.pyecharts.org/assets/v5/echarts.min.js"></script>
</head>
<body >
<div id="9088f2762ffc48a5b9bc2d33ca7c777c" class="chart-container" style="width:900px; height:500px; "></div>
<script>
var chart_9088f2762ffc48a5b9bc2d33ca7c777c = echarts.init(
document.getElementById('9088f2762ffc48a5b9bc2d33ca7c777c'), 'white', {renderer: 'canvas'});
var option_9088f2762ffc48a5b9bc2d33ca7c777c = {
"animation": true,
"animationThreshold": 2000,
"animationDuration": 1000,
"animationEasing": "cubicOut",
"animationDelay": 0,
"animationDurationUpdate": 300,
"animationEasingUpdate": "cubicOut",
"animationDelayUpdate": 0,
"aria": {
"enabled": false
},
"color": [
"#5470c6",
"#91cc75",
"#fac858",
"#ee6666",
"#73c0de",
"#3ba272",
"#fc8452",
"#9a60b4",
"#ea7ccc"
],
"series": [
{
"type": "bar",
"name": "\u5546\u5bb6A",
"legendHoverLink": true,
"data": [
5,
20,
36,
10,
75,
85
],
"realtimeSort": false,
"showBackground": false,
"stackStrategy": "samesign",
"cursor": "pointer",
"barMinHeight": 0,
"barCategoryGap": "20%",
"barGap": "30%",
"large": false,
"largeThreshold": 400,
"seriesLayoutBy": "column",
"datasetIndex": 0,
"clip": true,
"zlevel": 0,
"z": 2,
"label": {
"show": true,
"margin": 8
}
}
],
"legend": [
{
"data": [
"\u5546\u5bb6A"
],
"selected": {}
}
],
"tooltip": {
"show": true,
"trigger": "item",
"triggerOn": "mousemove|click",
"axisPointer": {
"type": "line"
},
"showContent": true,
"alwaysShowContent": false,
"showDelay": 0,
"hideDelay": 100,
"enterable": false,
"confine": false,
"appendToBody": false,
"transitionDuration": 0.4,
"textStyle": {
"fontSize": 14
},
"borderWidth": 0,
"padding": 5,
"order": "seriesAsc"
},
"xAxis": [
{
"show": true,
"scale": false,
"nameLocation": "end",
"nameGap": 15,
"gridIndex": 0,
"inverse": false,
"offset": 0,
"splitNumber": 5,
"minInterval": 0,
"splitLine": {
"show": true,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid"
}
},
"data": [
"\u886c\u886b",
"\u7f8a\u6bdb\u886b",
"\u96ea\u7eba\u886b",
"\u88e4\u5b50",
"\u9ad8\u8ddf\u978b",
"\u889c\u5b50"
]
}
],
"yAxis": [
{
"show": true,
"scale": false,
"nameLocation": "end",
"nameGap": 15,
"gridIndex": 0,
"inverse": false,
"offset": 0,
"splitNumber": 5,
"minInterval": 0,
"splitLine": {
"show": true,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid"
}
}
}
]
};
chart_9088f2762ffc48a5b9bc2d33ca7c777c.setOption(option_9088f2762ffc48a5b9bc2d33ca7c777c);
</script>
</body>
</html>
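
The HTML above is pyecharts render output; a minimal sketch of the kind of script that would produce an equivalent bar chart (series name and data taken from the option block above):

from pyecharts.charts import Bar

bar = (
    Bar()
    .add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
    .add_yaxis("商家A", [5, 20, 36, 10, 75, 85])
)
bar.render("render.html")  # writes a standalone HTML file like the one above
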
test.png (78.9 KB)

import json
import re
import requests
import time
from zhipuai import ZhipuAI
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
def pre_zhipuai_http(content_list, prompt_abstract):
url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
payload = json.dumps({
"model": "glm-4-flash",
"messages": [
{
"role": "user",
"content": prompt_abstract.replace('{content_list}', str(content_list))
}
]
})
headers = {
'Authorization': 'Bearer 6c8a99bde51835522a2af62ea71e6c0a.iOAOC3vaLYua2Rgr',
'Content-Type': 'application/json',
# 'Cookie': 'acw_tc=2760822c17247248406261468e6c541507ba9035f95078363549469047ee74'
}
    # payload is already a JSON-encoded string, so it must go through data=;
    # json=payload would serialize the string a second time
    response = requests.post(url, headers=headers, data=payload, verify=True)
    response = response.json()
return response['choices'][0]['message']['content']
def pre_zhipuai(content, prompt_abstract):
# zhipu
    client = ZhipuAI(
        api_key="6c8a99bde51835522a2af62ea71e6c0a.iOAOC3vaLYua2Rgr")  # fill in your own API key
    response = client.chat.completions.create(
        model="glm-4-flash",  # name of the model to call
messages=[
{"role": "user",
"content": prompt_abstract.replace('{content}', str(content))
}
],
)
llm_response = response.choices[0].message.content
return llm_response
def get_data(excel_file_path):
import pandas as pd
try:
df = pd.read_excel(excel_file_path).astype(str)
        dict_from_df = df.to_dict(
            orient='records')  # 'records' turns each row into a dict keyed by column name
data_list = []
for row in dict_from_df:
data_list.append(row)
return data_list
except Exception as e:
log.info(f"读取excel文件时出错:{e}")
return []
def clean_text(content):
# content = BeautifulSoup(content, "html.parser").text
content_ = content.replace('\n', '')
content_result = re.sub(r'\s+', ' ', content_)
# print(content_result)
return content_result
def gml2(area, content):
extract_prompt = "请阅读下文,概括总结有关{area}的问题挑战有哪些?不可回答文中没有出现的内容,若无{area}相关问题,请回答“文中未提及”。\n################\n{context}"
length_threshold = 2089
ptuning_url = "http://116.63.179.212:7861/local_doc_qa/async_chat"
llm_payload = json.dumps({
"question": extract_prompt.replace("{context}", content).replace("{area}", area)[:length_threshold],
"history": [],
"llm_params": {
"max_length": 2089,
"top_p": 0.7,
"temperature": 0.1
}
})
temp_problem_ = ""
max_retry = 2
retry_count = 0
while retry_count < max_retry:
try:
headers = {'Content-Type': 'application/json'}
ptuning_response = requests.post(ptuning_url, headers=headers, data=llm_payload, timeout=50)
temp_response = ptuning_response.text
temp_resp_json = json.loads(temp_response)
temp_problem_ = temp_resp_json["response"]
break
except Exception as e:
log.info(f"当前处理异常的内容为:{extract_prompt[:100]}")
log.info(e)
time.sleep(10) # 等待10秒后再次请求
retry_count += 1
continue
return temp_problem_
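
Note that the [:length_threshold] slice in gml2 trims the fully formatted prompt, so the instruction template silently eats into the article's budget. A hedged alternative that truncates only the article body (build_question is a hypothetical helper; template and limit as above):

def build_question(area, content, template, limit=2089):
    # measure the template with an empty context, then spend the
    # remaining budget on the article body alone
    header = template.replace("{area}", area).replace("{context}", "")
    budget = max(limit - len(header), 0)
    return template.replace("{area}", area).replace("{context}", content[:budget])
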
def write_excel(write_file_path, result_dic_list):
import os
from openpyxl import Workbook, load_workbook
dicts = result_dic_list
    if dicts:  # make sure dicts is not empty
        if os.path.exists(write_file_path):
            wb = load_workbook(write_file_path)
            ws = wb.active
        else:
            wb = Workbook()
            ws = wb.active
            # write the header row only for a newly created workbook;
            # appending it unconditionally would duplicate the header
            # every time an existing file is extended
            keyword = dicts[0]
            ws.append(list(keyword.keys()))
for dict_item in dicts:
if isinstance(dict_item, dict):
ws.append(list(dict_item.values()))
else:
ws.append(list([]))
wb.save(write_file_path)
else:
log.info("警告:dicts 列表为空")
if __name__ == "__main__":
start = time.time()
excel_file_path = "D:\kkwork\知识库服务\data\企业家精神-数据库汇总_11765_近三年.xlsx"
write_file_path = "D:\kkwork\知识库服务\data\企业家精神-数据库汇总_11765_近三年_result.xlsx"
data_list = get_data(excel_file_path)
new_list = []
area = "企业家精神"
for i in data_list:
content = i['正文']
content = clean_text(content)
response = gml2(area, content)
i["问题"] = response
log.info(response)
new_list.append(i)
write_excel(write_file_path, new_list)
# response['choices'][0]['message']['content']
end = time.time()
log.info(end - start)
# -*- coding: utf-8 -*-
import datetime
import os
import random
import sys
@@ -216,11 +217,16 @@ class BaseCore:
except :
pass
def __init__(self):
        # connect to Redis
self.r = redis.Redis(host="114.116.90.53", port=6380, password='RPHZgkDQ4zGJ', db=6)
self.__cnx_proxy = pymysql.connect(host='1.95.78.131', user='caiji', password='zzsn9988', db='clb_project',
self.r = redis.Redis(host='114.116.90.53', port=6380, password='RPHZgkDQ4zGJ', db=6)
self.cnx = pymysql.connect(host='1.95.78.131', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
self.cursor = self.cnx.cursor()
        # the 114.116.44.11 database
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
pass
    # compute elapsed time
@@ -291,11 +297,62 @@ class BaseCore:
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
def get_proxyIPPort(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxy = {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
ip_list.append(proxy)
return ip_list
    # pop (get and remove) one element from a Redis list
def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.116.90.53", port=6380, password='RPHZgkDQ4zGJ', db=6)
self.r = redis.Redis(host="114.115.236.206", port=6379, password='RPHZgkDQ4zGJ', db=6)
item = self.r.lpop(key)
return item.decode() if item else None
def getSidName(self, sid):
sqlSelect = f"SELECT words_name FROM `key_words` WHERE id = '{sid}'"
self.cursor_.execute(sqlSelect)
data = self.cursor_.fetchone()[0]
return data
    # get the PID of this script's process
def getPID(self):
PID = os.getpid()
return PID
def getUniqueCode(self, abbr, serverId, threadId):
while True:
timeCode = self.r.blpop(['timeCode'], 2)
if timeCode:
timeCode = timeCode[1]
timeCode = timeCode.decode('utf-8')
break
else:
time.sleep(2)
pid = str(self.getPID())
if len(pid) < 4:
pid = pid.zfill(4)
elif len(pid) > 4:
pid = pid[0:4]
threadId = str(threadId)
if len(threadId) > 1:
threadId = threadId[0]
uniqueCode = abbr + str(datetime.datetime.now().strftime('%Y%m%d'))[2:] + serverId + pid + str(threadId) + str(timeCode)
return uniqueCode
\ No newline at end of file
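
getUniqueCode concatenates: abbreviation + 6-digit date (yymmdd) + serverId + a 4-digit zero-padded PID + a 1-digit thread id + a timeCode popped from the Redis 'timeCode' list. A worked example (the PID 472 and timeCode '0001' are hypothetical):

# baseCore.getUniqueCode('GG', '195', '1') on 2024-11-04
# 'GG' + '241104' + '195' + '0472' + '1' + '0001'
# -> 'GG241104195047210001'
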
# -*- coding: utf-8 -*-
import datetime
import json
import time
import pandas as pd
import pymongo
import redis
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from baiduSpider import BaiduSpider
searchkw, wordsCode, sid = '', '', ''
baidu = BaiduSpider(searchkw, wordsCode, sid)
import urllib3
# r = redis.Redis(host="114.116.90.53", port=6380, password='RPHZgkDQ4zGJ', db=6)
# db_storage = pymongo.MongoClient('mongodb://1.95.69.135:27017/', username='admin', password='ZZsn@9988').ZZSN[
# '天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# import sys
# sys.path.append('D:\\PycharmProjects\\zzsn\\base')
from baseCore import BaseCore
baseCore = BaseCore()
# the 1.95.78.131 connection
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# the 114.116.44.11 connection
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log = baseCore.getLogger()
r = baseCore.r
def selectSql():
sql = """select * from google_search_list where state=0"""
cursor_.execute(sql)
return cursor_.fetchall()
def getNowDate():
    # get the current time
current_time = datetime.datetime.now()
    # format the time as a string
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
def getProcessitem(bdetail):
nowDate = getNowDate()
    if bdetail['content'] != '':
processitem = {
"sid": bdetail['sid'],
"source": "4",
"title": bdetail['title'],
"content": bdetail['content'],
"contentWithtag": bdetail['contentHtml'],
"origin": bdetail['origin'],
"publishDate": bdetail['publishDate'],
"sourceAddress": bdetail['detailurl'],
"createDate": nowDate
}
else:
processitem = {}
return processitem
def itemInsertToTable(item):
itemdata = []
nowtime = getNowDate()
data = (item['content'], item['contentHtml'], '1', nowtime, item['id'])
itemdata.append(data)
sql = "UPDATE google_search_list SET content=%s, content_with_tag=%s, state=%s, create_time=%s WHERE id=%s"
    cursor_.executemany(sql, itemdata)
    cnx_.commit()
    log.info("Database record updated successfully!")
def sendkafka(processitem):
producer = KafkaProducer(bootstrap_servers=['1.95.78.131:9092'])
try:
content = processitem['content']
publishDate = str(processitem['publishDate'])
title = processitem['title']
if title == '':
return
if content == '':
return
if publishDate == '':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
# self.logger.info("数据发送kafka成功")
log.info(kafka_result.get(timeout=10))
flg = True
    except Exception as e:
        flg = False
        # self.logger.info('failed to send to Kafka')
finally:
producer.close()
return flg
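
sendkafka constructs and closes a fresh KafkaProducer for every message; in a long-running loop a single shared producer with a value serializer is cheaper. A minimal sketch under that assumption (same broker address as above; send_item is a hypothetical helper):

import json
from kafka import KafkaProducer

shared_producer = KafkaProducer(
    bootstrap_servers=['1.95.78.131:9092'],
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf8'))

def send_item(item, topic='crawlerInfo'):
    # blocks until the broker acknowledges; returns RecordMetadata or raises
    return shared_producer.send(topic, item).get(timeout=10)
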
def sendMonitor(processitem):
log.info(processitem['uniqueCode'])
sidName = baseCore.getSidName(processitem['sid'])
    monitor = {
        "title": processitem['title'],  # title
        "sourceAddress": processitem['sourceAddress'],  # original article URL
        "uniqueCode": processitem['uniqueCode'],  # unique code: crawler type + 6-digit date + server id + thread id + custom digits
        "operateType": "DATA_CRAWLER",  # operation type, hard-coded
        "handlerBody": {
            "success": True,  # success/failure flag, hard-coded
            "handlerStatus": "CRAWLED"  # processing status, hard-coded
        },
        "source": {
            "sourceId": processitem['sid'],  # source id
            "sourceName": sidName,  # source name
            "sourceType": 4,  # source type, per the sourceType enum
        },
        "processTime": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # processing time, yyyy-MM-dd HH:mm:ss
        "server": {
            "serverIp": "94.74.96.195",  # server IP
            "serverHostName": "数据采集服务",  # server name
            "processId": baseCore.getPID()  # process id
        }
    }
producer = KafkaProducer(bootstrap_servers=['1.95.78.131:9092'], max_request_size=1024 * 1024 * 20,
api_version=(2, 7, 0))
try:
kafka_result = producer.send("crawlerInfo", json.dumps(monitor, ensure_ascii=False).encode('utf8'))
        log.info('Monitoring data sent to Kafka')
except Exception as e:
monitor = json.dumps(monitor, ensure_ascii=False)
monitorDic = {
'lifecycle_data_crawler': monitor
}
# self.rMonitor.xadd('data_lifecycle_log_data_crawler-redis', monitorDic, id='*')
        log.info('Failed to send monitoring data to Kafka')
if __name__ == "__main__":
while True:
resultList = selectSql()
df_list = []
for result in resultList:
id_ = result[0]
title = result[1]
publishDate = result[2]
origin = result[3]
url = result[4]
sid = result[10]
wordsCode = result[11]
keyword = result[6]
try:
content, contentWithTag, title = baidu.extractorMsg(url, title)
contentWithTag = baidu.rmTagattr(contentWithTag, url)
except Exception as e:
content = ''
contentWithTag = ''
if len(content) < 100:
continue
soup = BeautifulSoup(contentWithTag, "html.parser")
            # find all elements that carry a class attribute
elements_with_class = soup.find_all(class_=True)
            # loop over the elements and strip their class attribute
for element in elements_with_class:
del element.attrs["class"]
contentHtml = str(soup)
detailmsg = {
'id': id_,
'sid': sid,
'title': title,
'detailurl': url,
'content': content,
'contentHtml': contentHtml,
'origin': origin,
'publishDate': publishDate
}
processitem = getProcessitem(detailmsg)
uniqueCode = baseCore.getUniqueCode('GG', '195', '1')
processitem['uniqueCode'] = uniqueCode
try:
flg = sendkafka(processitem)
if flg:
r.sadd('pygoogle_' + wordsCode, processitem['sourceAddress'])
                    # update the database record
try:
itemInsertToTable(detailmsg)
except Exception as e:
log.info(f"插入数据库失败!{keyword}===={url}")
log.info(f"放入kafka成功!{keyword}===={url}")
sendMonitor(processitem)
except Exception as e:
log.info(f"放入kafka失败!{keyword}===={url}")
# df_list.append(detailmsg)
# df = pd.DataFrame(df_list)
# df.to_excel("./测试结果.xlsx", index=False)
time.sleep(1)
\ No newline at end of file
from baiduSpider import BaiduSpider
@@ -5,8 +5,8 @@ import requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# url = 'https://www.thepaper.cn/newsDetail_forward_26661172'
url = 'https://finance.huanqiu.com/article/9CaKrnK5O7o' # 澎湃新闻 虎嗅APP 经济观察网
title = '中国建材集团董事长宋志平:激发和保护企业家精神'
url = 'https://www.ctnews.com.cn/huanqiu/content/2024-10/08/content_165713.html' # 澎湃新闻 虎嗅APP 经济观察网
title = '中国味道——大运河美食体验工作坊亮相尼日利亚'
try:
detailurl = url
title = title