Commit cb908caf Author: 薛凌堃

Policies and regulations

Parent 593410c8
import json
@@ -124,78 +124,6 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
print(f'com_name: {short_name}, {year} already exists')
continue
else:
# # If the record type is an annual report, parse the report PDF and store it in the database
# for i in range(0, 3):
# try:
# resp_content = requests.request("GET", pdf_url).content
# # 获取pdf页数
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# break
# except Exception as e:
# print(e)
# time.sleep(3)
# continue
# if page_size < 1:
# # PDF parsing failed
# print(f'==={short_name}, {year}===PDF parsing failed')
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'PDF parsing failed')
# continue
# result = ''
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# break
# except Exception as e:
# print(e)
# time.sleep(3)
# continue
# if result == '':
# e = 'upload to file server failed'
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
# continue
#
# if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
#
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#
# type_id = '1'
# item_id = dic_info['social_code']
# group_name = 'group1'
#
# path = bytes.decode(result['Remote file_id']).replace('group1', '')
# full_path = bytes.decode(result['Remote file_id'])
# category = 'pdf'
# file_size = result['Uploaded size']
# order_by = num
# status = 1
# create_by = 'XueLingKun'
# create_time = time_now
# page_size = page_size
# try:
# tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
# category, file_size, order_by, status, create_by, create_time, page_size)
# state = 1
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
# except:
# e = 'database write failed'
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
# num = num + 1
# time.sleep(2)
# else:
# e = 'collection failed'
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
# continue
# upload to the file server
retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
# insert into the database to obtain att_id
num = num + 1
......
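The commented-out block deleted above (download with retries, page count via fitz, fastdfs upload) is collapsed into the single baseCore.upLoadToServe call. Its real signature and return value are not shown in this diff; the following is only a minimal sketch, reconstructed from the deleted inline logic, of what such a helper plausibly wraps (the name up_load_to_serve and the returned dict are hypothetical):

import time
import fitz  # PyMuPDF
import requests

def up_load_to_serve(client, pdf_url, type_id, social_code, retries=3):
    """Hypothetical helper consolidating the deleted inline logic:
    download the PDF with retries, count its pages, upload it to
    the file server, and return the metadata needed for att_id."""
    content, page_size = None, 0
    for _ in range(retries):
        try:
            content = requests.get(pdf_url, timeout=30).content
            with fitz.open(stream=content, filetype='pdf') as doc:
                page_size = doc.page_count
            break
        except Exception as e:
            print(e)
            time.sleep(3)
    if not content or page_size < 1:
        return None  # download or parse failed; caller logs the failure
    result = client.upload_by_buffer(content, file_ext_name='pdf')
    if 'Remote file_id' not in str(result):
        return None  # upload to the file server failed
    return {
        'social_code': social_code,
        'type_id': type_id,
        'full_path': bytes.decode(result['Remote file_id']),
        'file_size': result['Uploaded size'],
        'page_size': page_size,
    }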
@@ -47,41 +47,6 @@ def replaceUrl(hostUrl,src):
finnal_href = hostUrl + src
return finnal_href
def attachjob(fu_jian_soup,href):
for fu_jian_tag in fu_jian_soup:
try:
# attachment link
fu_jian_href = fu_jian_tag['href']
pass
except:
continue
# todo: replace the link with an absolute path
# todo: upload the attachment to the file server, get back the server path and attid, and substitute them; no need to parse the content
if '.html' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
if 'http' in fu_jian_href:
pass
else:
# count how many '../' segments there are
if '../' in fu_jian_href:
count = fu_jian_href.count("../")
if count == 1:
hostUrl = 'https://gzw.beijing.gov.cn/xxfb/zcfg/'
if count == 2:
hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
if count == 3:
hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
else:
if './' in fu_jian_href:
hostUrl = href.split('/t')[0]
# convert to an absolute path
fin_fj_href = replaceUrl(hostUrl, fu_jian_href)
# replace the path in the tag with the new one
fu_jian_tag['href'] = fin_fj_href
return fu_jian_soup
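The attachjob helper above resolves relative links by counting '../' segments against hard-coded gzw.beijing.gov.cn prefixes (note that count == 2 and count == 3 even map to the same base), which breaks as soon as the site layout changes. urllib.parse.urljoin resolves '../' and './' generically against the page URL, which is what the paserUrl helper added below relies on. A quick illustration (the file names are made up; the base URL follows the pattern hard-coded above):

from urllib.parse import urljoin

page = 'https://gzw.beijing.gov.cn/xxfb/zcfg/t20230101_123.html'
print(urljoin(page, '../../tzgg/P020230101.pdf'))
# https://gzw.beijing.gov.cn/tzgg/P020230101.pdf
print(urljoin(page, './P020230101.pdf'))
# https://gzw.beijing.gov.cn/xxfb/zcfg/P020230101.pdf
print(urljoin(page, 'https://example.com/a.pdf'))  # absolute URLs pass through
# https://example.com/a.pdf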
def save_data(result_dict):
try:
aa = result_dict['信息来源']
@@ -487,6 +452,21 @@ def get_content3():
end_time = time.time()
print(f'Crawled {num} records, elapsed time {end_time - start_time}')
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# convert relative addresses in the html into absolute addresses
def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser')
# collect all <a> and <img> tags
links = html.find_all(['a', 'img'])
# walk the tags, converting relative addresses to absolute ones
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
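A minimal usage sketch of the new helper, assuming the paserUrl defined above is in scope (the HTML snippet and URLs are illustrative): it takes an already-parsed BeautifulSoup object plus the page URL and rewrites href/src attributes in place:

from bs4 import BeautifulSoup

html = BeautifulSoup(
    '<div><a href="../doc/file.pdf">attachment</a><img src="./logo.png"></div>',
    'html.parser')
soup = paserUrl(html, 'https://gzw.beijing.gov.cn/xxfb/zcfg/index.html')
print(soup.find('a')['href'])   # https://gzw.beijing.gov.cn/xxfb/doc/file.pdf
print(soup.find('img')['src'])  # https://gzw.beijing.gov.cn/xxfb/zcfg/logo.png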
# Beijing
def bei_jing():
@@ -556,15 +536,14 @@ def bei_jing():
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont,'lxml')
fu_jian_soup = soup_cont.find_all('a')
attachjob(fu_jian_soup,href[0])
print(fu_jian_soup)
# print(fu_jian_soup)
print(soup_cont)
print(title)
soup = paserUrl(soup_cont, href)
text = str(soup.prettify())
print(text)
# print(title)
num = 0
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
num+=1
file_href = file['href']
......
@@ -58,7 +58,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# adjustable
time.sleep(30)
time.sleep(70)
s = requests.session()
# obtain the token and cookies
......
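Raising the fixed sleep from 30 to 70 seconds simply gives the operator more time to finish the manual QR-code login before the token and cookies are read. A hedged alternative, assuming the same Selenium browser object and that a successful mp.weixin.qq.com login puts a token parameter in the URL (an assumption worth verifying), is to poll instead of sleeping a fixed time:

import time

def wait_for_login(browser, timeout=180, poll=2):
    """Poll instead of sleeping a fixed 70s: return once the scan-to-login
    has completed, or raise after `timeout` seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if 'token=' in browser.current_url:  # assumed post-login marker
            return
        time.sleep(poll)
    raise TimeoutError('WeChat MP login was not completed in time')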