Commit 4a3bd851 Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -449,6 +449,21 @@ def omeng():
         # r.rpush('gnOMEnterprise_socialcode:Notice', item)
     closeSql(cnx, cursor)
+# "Single-item champion" enterprises
+def danxiangguanjun():
+    pass
+# Sci-tech reform demonstration enterprises
+def kegaishifan():
+    pass
+# "Double Hundred" enterprises
+def shuangbaiqiye():
+    pass
+# "Specialized, refined, distinctive, innovative" enterprises
+def zhuangjingtexind():
+    pass
 if __name__ == "__main__":
     start = time.time()
......
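Note: the four functions added here are empty stubs. If they are meant to follow the same pattern as omeng() above, each would read its enterprise list from MySQL and push the social codes to Redis. A minimal sketch of one of them, under that assumption; the opener helper, table name, and Redis key are hypothetical placeholders, not part of this commit:

def danxiangguanjun():
    cnx, cursor = connectSql()  # hypothetical opener paired with closeSql()
    cursor.execute('select SocialCode from DanXiangGuanJun')  # hypothetical table
    for (social_code,) in cursor.fetchall():
        r.rpush('DanXiangGuanJun:SocialCode', social_code)  # hypothetical Redis key
    closeSql(cnx, cursor)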
@@ -46,8 +46,8 @@ def doJob():
         if tycid == None or tycid == '':
             try:
                 retData = getTycIdByXYDM(xydm)
-                if retData:
-                    tycid = retData['id']
+                if retData['state']:
+                    tycid = retData['tycData']['id']
                 # todo: write the TYCID back to the database
                 updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                 cursor_.execute(updateSql)
......
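Note: the new condition implies that getTycIdByXYDM now returns a wrapper dict of roughly the form {'state': bool, 'tycData': {'id': ...}} rather than the record itself; that shape is inferred from this hunk, not verified against the helper. Separately, the f-string UPDATE interpolates values straight into SQL. A parameterized sketch of the same statement:

# Same update, but letting the driver quote the values (avoids SQL injection
# and broken quoting if tycid or xydm ever contains a quote character).
updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
cursor_.execute(updateSql, (tycid, xydm))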
@@ -101,8 +101,8 @@ def spider(com_name,cik,up_okCount):
     # parse the page
     for nnn in range(0,4):
         try:
-            req = requests.get(url=url_json,headers=header,proxies=ip_dic,verify=False,timeout=30)
-            # req = requests.get(url=url_json, headers=header, verify=False, timeout=30)
+            # req = requests.get(url=url_json,headers=header,proxies=ip_dic,verify=False,timeout=30)
+            req = requests.get(url=url_json, headers=header, verify=False, timeout=30)
             break
         except:
             time.sleep(2)
......
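Note: this hunk toggles between the proxied and the direct request by swapping which line is commented out. A sketch that keeps both paths behind a flag instead, so the retry loop never needs editing (USE_PROXY is a hypothetical switch; url_json, header, and ip_dic come from the surrounding code):

USE_PROXY = False  # hypothetical; True restores the proxied path
for nnn in range(0, 4):
    try:
        req = requests.get(url=url_json, headers=header,
                           proxies=ip_dic if USE_PROXY else None,
                           verify=False, timeout=30)
        break
    except requests.RequestException:  # narrower than the bare except above
        time.sleep(2)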
@@ -41,6 +41,14 @@ type_map = {
     '9605':'公司公告',
     '9533':'公司公告',
 }
+type_id_map = {
+    '公司公告': '8',
+    '股转公告': '9',
+    '挂牌审核': '10',
+    '自律监管措施': '11',
+    '问询函': '12',
+    '纪律处分': '13'
+}
 def secrchATT(item_id, name, type_id):
     sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
@@ -157,7 +165,7 @@ def InsterInto(short_name, social_code, pdf_url):
     return insert
-def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
+def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num,kfkid):
     # upload to the file server
     retData = baseCore.upLoadToServe(pdf_url,8,social_code)
     # insert the attachment into the att database
@@ -192,7 +200,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
         'sourceAddress': pdf_url,  # link to the original document
         'summary': '',
         'title': pdf_name,
-        'type': 3,
+        'type': kfkid,
         'socialCreditCode': social_code,
         'year': year
     }
@@ -241,6 +249,7 @@ def SpiderByZJH(url, dic_info, start_time,num):  # dic_info is fetched from the database
         pdf_url = 'https://www.neeq.com.cn' + rp['destFilePath']
         name_pdf = rp['disclosureTitle']
         rp_type = type_map[rp['disclosureType']]
+        kfkid = type_id_map[rp_type]
         publishDate = rp['publishDate']
         year = publishDate[:4]
         # save the record to the database
@@ -250,7 +259,7 @@ def SpiderByZJH(url, dic_info, start_time,num):  # dic_info is fetched from the database
             # okCount = okCount + 1
             # parse the PDF: get the link, download it, then record parse success/failure and transfer success/failure
             log.info(f'======={short_name}===========announcement inserted successfully')
-            result = GetContent(pdf_url, name_pdf, social_code, year, publishDate, start_time, com_name, num)
+            result = GetContent(pdf_url, name_pdf, social_code, year, publishDate, start_time, com_name, num,kfkid)
             if result:
                 # list of announcement records
@@ -300,17 +309,18 @@ if __name__ == '__main__':
     while True:
         start_time = time.time()
         # # fetch enterprise info
-        # # social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
-        social_code = '9110000071092841XX'
-        com_code = '430045'
-        short_name = '超毅网络'
+        social_code = baseCore.redicPullData('NQEnterprise:nq_finance')
+        # social_code = '9110000071092841XX'
+        # com_code = '430045'
+        # short_name = '超毅网络'
         dic_info = {}
         # # if Redis has run out of data, wait
-        # if social_code == None:
-        #     time.sleep(20)
-        #     continue
-        # dic_info = baseCore.getInfomation(social_code)
-        # count = dic_info[16]
+        if social_code == None:
+            time.sleep(20)
+            continue
+        data = baseCore.getInfomation(social_code)
+        com_code = data[3]
+        short_name = data[4]
         url = 'https://www.neeq.com.cn/disclosureInfoController/productInfoResult.do'
         # pagination: page 0 through totalPages (about 25)
......
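Note: two things worth checking in this file. First, 'type' was previously the integer 3 but kfkid is a string such as '8', so downstream consumers of the message may see a type change. Second, both map lookups raise KeyError for any disclosureType outside the six mapped labels. A defensive sketch of the lookup inside the loop:

# Skip (and log) disclosure types that neither map covers, instead of crashing.
rp_type = type_map.get(rp['disclosureType'])
if rp_type is None or rp_type not in type_id_map:
    log.info(f"unmapped disclosureType: {rp['disclosureType']}")
    continue
kfkid = type_id_map[rp_type]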
@@ -56,7 +56,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # adjustable
-    time.sleep(60)
+    time.sleep(20)
     s = requests.session()
     # obtain the token and cookies
......
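Note: this sleep is the window for manually scanning the WeChat login QR code, so shortening it from 60 to 20 seconds may be tight. A sketch that waits for login to actually complete instead of sleeping a fixed time, assuming the post-login redirect puts a token parameter in the URL:

from selenium.webdriver.support.ui import WebDriverWait

# Poll up to 60 s for the redirect that signals a completed login.
WebDriverWait(browser, 60).until(lambda d: 'token=' in d.current_url)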