Commit a97764ed Author: 薛凌堃

9/14

Parent 495275b6
......@@ -339,7 +339,7 @@ if __name__ == '__main__':
continue
dic_info = baseCore.getInfomation(social_code)
log.info(f'----当前企业{social_code}--开始处理---')
count = dic_info[13]
count = dic_info[14]
com_name = dic_info[1]
social_code = dic_info[2]
# 企查查 (Qichacha) id
......
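
Editor's note: the hunk above bumps the positional index of the crawl counter from dic_info[13] to dic_info[14]. A minimal sketch, under the assumption that the field order is only what this diff reveals (it is not the real table schema), of how those positions could be gathered into one named mapping so a future index shift is a one-line change:

    # Hypothetical field order, inferred only from the indices used in this diff.
    from collections import namedtuple

    EnterpriseInfo = namedtuple('EnterpriseInfo', ['com_name', 'social_code', 'code', 'count'])

    def to_enterprise_info(dic_info):
        # dic_info is the raw row returned by baseCore.getInfomation(social_code)
        return EnterpriseInfo(
            com_name=dic_info[1],
            social_code=dic_info[2],
            code=dic_info[3],
            count=dic_info[14],  # was dic_info[13] before this commit
        )
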
......@@ -66,8 +66,8 @@ def getTycIdByXYDM(xydm):
def updateTycInfo():
while True:
# Use the social credit code pulled from Redis to look up the corresponding basic info in the database
# social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
social_code = '9111000066990444XF'
social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
# social_code = '9111000066990444XF'
# If Redis has no more data, wait
if social_code == None:
time.sleep(20)
......@@ -88,7 +88,7 @@ def updateTycInfo():
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
tycid = retData['id']
# todo: write to the database
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
......
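
Editor's note: the update in this hunk writes the TYC id with an f-string SQL statement. A minimal sketch, assuming cursor_ / cnx_ are the pymysql cursor and connection used elsewhere in this commit, of the same update with a parameterized query so a stray quote in the input cannot break the SQL:

    # Same update as in the hunk above, but parameterized instead of interpolated.
    def save_tycid(cursor_, cnx_, xydm, tycid):
        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
        cursor_.execute(updateSql, (tycid, xydm))
        cnx_.commit()
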
import json
import redis
from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore =BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor_ = cnx_.cursor()
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
# cnx_.commit()
import time
# from getTycId import getTycIdByXYDM
# social_code = '91440101231247350J'
# data = baseCore.getInfomation(social_code)
# tycid = data[11]
# if tycid == None:
# print(data)
# retData = getTycIdByXYDM(social_code)
# tycid = retData['tycData']['id']
# print(tycid)
# time_struct = time.localtime(int(1692762780000 / 1000)) # first convert the millisecond timestamp to struct_time
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # then format the struct_time as a string
# print(time_format)
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
# # source key name
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode'
# # target key name
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode'
# values = r.lrange(key1,0,-1)
# for value in values:
# r.rpush(key2, value)
#
# # Close the Redis connection
# r.close()
list_all = []
if list_all:
print(len(list_all))
else:
print('---')
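
Editor's note: the commented-out block above moves every member of one Redis list onto another. A runnable sketch of that migration, with the connection details and key names copied verbatim from the commented code (db and keys would be adjusted per run):

    import redis

    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=6)
    src_key = 'CorPersonEnterpriseFbs:gnqy_socialCode'
    dst_key = 'NewsEnterpriseFbs:gnqy_socialCode'

    for value in r.lrange(src_key, 0, -1):   # read the whole source list
        r.rpush(dst_key, value)              # append each member to the target list

    r.close()
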
"""
Enterprise listing info: only listed companies are added to the enterprise library; unlisted companies skip the collection step. Delisted companies are flagged as 0.
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# from gpdm import Gpdm
baseCore = BaseCore()
# chromedriver = r"E:\kkwork\zzsn_spider\comData\ipoInfo\chromedriver.exe"
# browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网/新三板'
# gpdm = Gpdm()
# gpdmList = gpdm.doJob()
log = baseCore.getLogger()
error_list = []
list_all_info = []
# The stock code and the enterprise credit code must be provided
while True:
# Read enterprises from the table
# com_code1 = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
com_code = '838616'
short_name = ''
social_code = ''
start = time.time()
log.info(f'======开始采集{com_code}======')
url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html'
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'qgqp_b_id=28edcf226f056ee077983f40f115eacf; st_si=15067486119520; emshistory=%5B%22%E4%BA%A7%E4%B8%9A%E9%93%BE%22%2C%22sz007sz%22%5D; websitepoptg_show_time=1694403032729; HAList=ty-0-002342-%u5DE8%u529B%u7D22%u5177%2Cty-0-301192-%u6CF0%u7965%u80A1%u4EFD%2Cty-1-688382-%u76CA%u65B9%u751F%u7269-U%2Cty-1-600895-%u5F20%u6C5F%u9AD8%u79D1%2Cty-1-600669-*ST%u978D%u6210%2Cty-116-00691-%u5C71%u6C34%u6C34%u6CE5%2Cty-0-300865-%u5927%u5B8F%u7ACB%2Cty-0-000656-%u91D1%u79D1%u80A1%u4EFD%2Cty-1-600257-%u5927%u6E56%u80A1%u4EFD%2Cty-1-688981-%u4E2D%u82AF%u56FD%u9645; xsb_history=833658%7C%u94C1%u8840%u79D1%u6280%2C838616%7C%u5317%u9CD0%u98DF%u54C1; st_asi=delete; st_pvi=44810095342512; st_sp=2023-07-18%2013%3A55%3A09; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=337; st_psi=20230914142347564-119112305908-4534169252',
'Host': 'xinsanban.eastmoney.com',
'Pragma': 'no-cache',
'Referer': 'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
req = requests.get(url=url,headers=headers)
reslut = BeautifulSoup(req.content,'html.parser')
# print(reslut)
li_list = reslut.find('div',id='company_info').find('ul',class_='company-page-left').find_all('li')
security = reslut.find('div',id='security_info').find('ul',class_='company-page-right').find_all('li')
listingDate = security[1].find('span',class_='company-page-item-right').text
businessScope = li_list[7].find('span',class_='company-page-item-right').text
industry = li_list[8].find('span',class_='company-page-item-right').text
secutities_type = '新三板'
category = '3'
exchange = '1'
dic_cwsj = {
"exchange": exchange,
"category": category, # 股票类型(1-A股;2-B股;3-新三板;4-H股)
'listed': '1',
"listingDate": listingDate,
"securitiesCode": com_code,
"securitiesShortName": short_name,
"securitiesType": secutities_type,
"socialCreditCode": social_code,
"businessScope": businessScope,
"eastIndustry": industry,
"csrcIndustry": ''
}
print(dic_cwsj)
break
# dic_cwsj = {
# "exchange": jys_code,
# "category": '1', # 股票类型(1-A股;2-B股;3-新三板;4-H股)
# 'listed':'1',
# "listingDate": shangshishijian,
# "securitiesCode": com_code[2:],
# "securitiesShortName": short_name,
# "securitiesType": '新三板',
# "socialCreditCode": id_code,
# "businessScope": zhuyingfanwei,
# "eastIndustry": dongcai,
# "csrcIndustry": zhengjian
# }
#
# list_all_info.append(dic_cwsj)
# log.info(f'======{com_code}====采集成功=====')
#
# # Save the data to the database via the API
# for num in range(0, len(list_all_info),100):
#
# json_updata = json.dumps(list_all_info[num:num+100])
# # print(json_updata)
# try:
# response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
# verify=False)
# except Exception as e:
# print(e)
# print("{}:到:{}".format(num, num + 100))
# print(response.text)
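
Editor's note: the commented-out tail above pushes list_all_info to the sync endpoint in slices of 100. A sketch of that batching step under the same assumptions (endpoint URL as it appears above; printing the response is illustrative, not taken from the original):

    import json
    import requests

    def push_ipo_batches(list_all_info, batch_size=100):
        # Post the accumulated listing records to the sync endpoint in chunks.
        for num in range(0, len(list_all_info), batch_size):
            json_updata = json.dumps(list_all_info[num:num + batch_size])
            try:
                response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo',
                                          data=json_updata, timeout=300, verify=False)
                print(f'{num} to {num + batch_size}: {response.text}')
            except Exception as e:
                print(e)
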
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can ocur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include HTTP other settiongs
##include http.conf
\ No newline at end of file
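
Editor's note: this client.conf is the file consumed a few hunks below via get_tracker_conf. A minimal sketch of wiring it up, using the same calls that appear in the next file of this commit (the upload call is only an assumption about the fdfs_client API in use, hence left commented):

    from fdfs_client.client import get_tracker_conf, Fdfs_client

    tracker_conf = get_tracker_conf('./client.conf')   # reads tracker_server, timeouts, log_level ...
    client = Fdfs_client(tracker_conf)

    # Uploading a downloaded PDF from memory is assumed to look roughly like this;
    # the exact call and result keys depend on the fdfs_client version in use.
    # result = client.upload_by_buffer(pdf_bytes, file_ext_name='pdf')
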
......@@ -19,10 +19,7 @@ from fdfs_client.client import get_tracker_conf, Fdfs_client
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()
cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor_ = cnx_.cursor()
cnx = baseCore.cnx
......@@ -32,7 +29,7 @@ cursor = baseCore.cursor
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业公告/证监会'
taskType = '企业公告/证监会/福布斯'
def RequestUrl(url, payload, social_code,start_time):
# ip = get_proxy()[random.randint(0, 3)]
......@@ -142,30 +139,25 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
inster = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
cursor_.execute(sel_sql, (social_code, pdf_url))
selects = cursor_.fetchone()
cursor.execute(sel_sql, (social_code, pdf_url))
selects = cursor.fetchone()
if selects:
print(f'com_name:{short_name}、{pdf_url}已存在')
return inster
# Insert the info into the database
# todo: insert the info into the database, switching to the other database
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,author,type) values(%s,%s,%s,%s,%s)'''
list_info = [
social_code,
name_pdf,
'', # summary
'', # body text
pub_time, # publish date
pdf_url,
'证监会',
report_type,
'1',
'zh'
'1'
]
cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit()
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
insert = True
return insert
except:
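
Editor's note: the hunk above trims the insert down to five columns and switches to the cnx / cursor connection created at the top of the file. A compact sketch of the same dedup-then-insert flow, with the column list and values taken from the new statement (the helper name is illustrative):

    def insert_announcement(cursor, cnx, social_code, pdf_url, report_type):
        # Skip announcements whose URL is already recorded for this company.
        sel_sql = ('select social_credit_code, source_address from brpa_source_article '
                   'where social_credit_code = %s and source_address = %s')
        cursor.execute(sel_sql, (social_code, pdf_url))
        if cursor.fetchone():
            return False
        insert_sql = ('insert into brpa_source_article'
                      '(social_credit_code, source_address, origin, author, type) '
                      'values(%s, %s, %s, %s, %s)')
        cursor.execute(insert_sql, (social_code, pdf_url, '证监会', report_type, '1'))
        cnx.commit()
        return True
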
......@@ -201,6 +193,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
return False
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo: the attachment id needs to be uploaded to the att database
dic_news = {
'attachmentIds': id,
'author': '',
......@@ -372,33 +365,29 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# Get the enterprise info
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
# If Redis has no more data, wait
# if social_code == None:
# time.sleep(20)
# continue
if social_code == None:
time.sleep(20)
continue
# Get the enterprise info
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is Null limit 1 "
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 ='1' limit 1 "
# Industrial Bank (兴业银行)
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
# cursor.execute(query)
# row = cursor.fetchone()
# if row:
# pass
# else:
# print('没有数据了,结束脚本')
# break
# tycid = row[14]
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# 1 means the record has been picked up
updateBeginSql = f"update Tfbs_bak set state2='1',date1='{time_now}' where col3='{social_code}' "
cursor.execute(updateBeginSql)
cnx.commit()
dic_info = baseCore.getInfomation(social_code)
com_name = dic_info[1]
social_code = dic_info[2]
code = dic_info[3]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
count = dic_info[16]
# Shanghai market: http://eid.csrc.gov.cn/101111/index.html  Shenzhen market: http://eid.csrc.gov.cn/101811/index.html  Beijing Stock Exchange: http://eid.csrc.gov.cn/102611/index.html
# URL pattern for paging within a column, e.g. http://eid.csrc.gov.cn/101811/index_3_f.html
......@@ -412,7 +401,6 @@ if __name__ == '__main__':
# Pick the link based on the stock code
# Codes starting with 0, 2 or 3 are Shenzhen Stock Exchange; 6 or 9 are Shanghai Stock Exchange; 4 or 8 are Beijing Stock Exchange
code = dic_info[3]
short_name = dic_info[4]
dic_parms = getUrl(code, url_parms, Catagory2_parms)
dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
......
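
Editor's note: the comment above encodes the exchange routing rule used by getUrl. A standalone sketch of that mapping (the real getUrl builds full CSRC URLs from url_parms; the labels returned here are only illustrative):

    def pick_exchange(code: str) -> str:
        # Route a stock code to its exchange by the leading digit.
        if code[0] in ('0', '2', '3'):
            return 'Shenzhen Stock Exchange'   # http://eid.csrc.gov.cn/101811/index.html
        if code[0] in ('6', '9'):
            return 'Shanghai Stock Exchange'   # http://eid.csrc.gov.cn/101111/index.html
        if code[0] in ('4', '8'):
            return 'Beijing Stock Exchange'    # http://eid.csrc.gov.cn/102611/index.html
        raise ValueError(f'unexpected stock code prefix: {code}')
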
import policy
import tingtype
import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler
basecore = BaseCore.BaseCore()
log = basecore.getLogger()
def policylaw_task():
# Instantiate a scheduler
scheduler = BlockingScheduler()
# Run once per day
scheduler.add_job(policy, 'cron', hour=0,minute=0)
scheduler.add_job(tingtype, 'cron', hour=0, minute=0)
try:
scheduler.start()
except Exception as e:
log.info('定时采集异常', e)
pass
policylaw_task()
\ No newline at end of file
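
Editor's note: as written, add_job is handed the imported policy and tingtype modules themselves, while APScheduler expects callables. A hedged sketch of a runnable variant that schedules the modules' entry functions instead; the names policy.doJob / tingtype.doJob are assumptions, since the real entry points are not shown in this diff:

    import policy
    import tingtype
    import BaseCore
    from apscheduler.schedulers.blocking import BlockingScheduler

    basecore = BaseCore.BaseCore()
    log = basecore.getLogger()

    def policylaw_task():
        scheduler = BlockingScheduler()  # one scheduler, daily cron triggers
        scheduler.add_job(policy.doJob, 'cron', hour=0, minute=0)    # assumed entry point
        scheduler.add_job(tingtype.doJob, 'cron', hour=0, minute=0)  # assumed entry point
        try:
            scheduler.start()
        except Exception as e:
            log.info(f'scheduled collection raised: {e}')

    policylaw_task()
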
import json
import datetime
import datetime
import json
import random
import time
......@@ -277,6 +278,10 @@ if __name__=='__main__':
job()
except Exception as e:
print(e)
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
# Create an ExcelWriter object
# writer = pd.ExcelWriter('国务院厅局.xlsx')
......
import json
from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore =BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor_ = cnx_.cursor()
updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# print(updateBeginSql)
cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
cnx_.commit()