下载pdf文件

0c9bb0cd · 丁双波 · 8be2a4ec · 0c9bb0cd · 0c9bb0cd
--- a/tmp/usVsRussia/downPdf.py
+++ b/tmp/usVsRussia/downPdf.py
+#下载pdf文件
+import os
+from datetime import time
+import pymysql
+import requests
+import urllib3
+from pymysql.converters import escape_string
+from base.BaseCore import BaseCore
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+# Browser-like HTTP request headers; downFile() passes this dict to requests.get().
+# The cookie entry below is intentionally commented out, so no cookie is sent.
+headers = {
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+    'accept-encoding': 'gzip, deflate, br',
+    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+    'cache-control': 'max-age=0',
+    # 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
+    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': "Windows",
+    'sec-fetch-dest': 'document',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-site': 'same-origin',
+    'sec-fetch-user': '?1',
+    'upgrade-insecure-requests': '1',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
+}
+# Module-level shared objects: project helper, its logger, and one MySQL
+# connection/cursor that is opened as soon as this module is loaded.
+baseCore = BaseCore()
+log =baseCore.getLogger()
+# NOTE(review): the database host, user and password are hard-coded in source
+# control here; consider loading them from configuration or the environment.
+cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
+                      charset='utf8mb4')
+cursor = cnx.cursor()
+def get_file_name(headers):
+    """Return the file name announced in a response's Content-Disposition header.
+
+    :param headers: mapping of response headers (e.g. ``response.headers``).
+    :return: the value of the ``filename=`` parameter with surrounding
+        whitespace and quotes removed; when the header is absent or carries
+        no usable ``filename=`` parameter, ``baseCore.getNextSeq() + ".pdf"``.
+
+    Only the plain ``filename=`` parameter is understood; the extended
+    ``filename*=`` form is ignored and leads to the generated fallback name.
+    """
+    filename = ''
+    disposition = headers.get('Content-Disposition')
+    if disposition:
+        # Parameters follow the disposition type ("attachment; filename=...");
+        # scan all of them instead of assuming filename is the second segment.
+        for part in disposition.split(';')[1:]:
+            # partition() splits on the first '=' only, so names that contain
+            # '=' themselves are not truncated.
+            key, sep, value = part.strip().partition('=')
+            if sep and key.strip().lower() == 'filename':
+                # Servers usually quote the value; quotes are not part of the name.
+                filename = value.strip().strip('"\'')
+                if filename:
+                    break
+    if not filename:
+        return baseCore.getNextSeq()+".pdf"
+    return filename
+def downFile(url,path):
+    """Download ``url`` into directory ``path``.
+
+    The directory is prepared with ``baseCore.mkPath``, the request goes
+    through the local proxy at 127.0.0.1:1080 with TLS verification disabled,
+    and the body is written to disk in 1 KiB chunks.
+
+    :param url: address of the file to download.
+    :param path: target directory.
+    :return: the name of the saved file, or ``False`` when anything fails
+        (network error, non-2xx HTTP status, file system error); the error is
+        logged, never raised.
+    """
+    try:
+        baseCore.mkPath(path)
+        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
+        # stream=True keeps the body off the heap until iter_content() reads it;
+        # the with-block releases the connection when the download is done.
+        with requests.get(url, proxies=proxy, headers=headers, verify=False,
+                          timeout=10, stream=True) as response:
+            # Without this an HTTP error page would be saved as if it were the PDF.
+            response.raise_for_status()
+            # The name comes from a remote header: keep only its last path
+            # component so it cannot escape the target directory.
+            fileName = os.path.basename(get_file_name(response.headers).replace('\\', '/'))
+            with open(os.path.join(path, fileName), "wb") as pyFile:
+                for chunk in response.iter_content(chunk_size=1024):
+                    if chunk:
+                        pyFile.write(chunk)
+    except Exception as e:
+        log.error(f"出错了----------{e}")
+        return False
+    return fileName
+if __name__ == '__main__':
+    # Work loop: take the oldest row with state=0, download its url into a
+    # directory derived from website/ftype/stype/ttype, then mark the row
+    # state=1 (saved, with pdf_name/pdf_path) or state=2 (download failed).
+    basePath = r'D:\美国VS俄罗斯制裁'
+    try:
+        while True:
+            selectSql = "select id,url,website,ftype,stype,ttype from usvsrussia where state=0  order by id asc limit 1"
+            cursor.execute(selectSql)
+            data = cursor.fetchone()
+            if not data:
+                log.info("数据处理完毕，程序退出")
+                break
+            rowId, url, website, ftype, stype, ttype = data
+            log.info(f"开始处理{url}----")
+            path = basePath
+            # Each non-empty classification level becomes one sub-directory.
+            for part in (website, ftype, stype, ttype):
+                if part:
+                    path = os.path.join(path, part)
+            fileName = downFile(url,path)
+            if fileName:
+                # fileName originates from a remote HTTP header, so it is passed
+                # as a bound parameter rather than formatted into the statement.
+                updateSql = "update usvsrussia set state=1,pdf_name=%s ,pdf_path=%s where id=%s"
+                updateArgs = (fileName, path, rowId)
+                log.info(f"开始处理{url}----处理ok")
+            else:
+                updateSql = "update usvsrussia set state=2 where id=%s"
+                updateArgs = (rowId,)
+                log.info(f"开始处理{url}----处理error")
+            cursor.execute(updateSql, updateArgs)
+            cnx.commit()
+        url = 'https://ofac.treasury.gov/media/931946/download?inline'
+        log.info(f"{url}----开始下载")
+        # downFile() requires a target directory; the one-off download goes
+        # to the base directory.
+        downFile(url, basePath)
+        log.info(f"{url}----开始下载，下载完成")
+    finally:
+        # Release the helper and the database handles even if the loop raised.
+        baseCore.close()
+        cursor.close()
+        cnx.close()
\ No newline at end of file
--- a/tmp/usVsRussia/ofac.py
+++ b/tmp/usVsRussia/ofac.py
@@ -790,13 +790,206 @@ def job4():
        cursor.execute(insertSql)
        cnx.commit()
    driverContent.close()
+def _collectListSection(driver, ftype, stype, ttype, xpath):
+    """Store every not-yet-known link of one ``<li>`` list in table usvsrussia.
+
+    :param driver: Selenium driver with the page already loaded.
+    :param ftype: first-level category stored with each row.
+    :param stype: second-level category stored with each row.
+    :param ttype: third-level category (the list's heading) stored with each row.
+    :param xpath: XPath selecting the ``<li>`` elements of the list.
+    """
+    log.info(f"开始采集栏目---{stype}---")
+    for liEle in driver.find_elements(By.XPATH, xpath):
+        # find_elements returns [] instead of raising, so an item without a
+        # link is skipped rather than aborting the whole job.
+        anchors = liEle.find_elements(By.TAG_NAME, 'a')
+        if not anchors:
+            continue
+        text = liEle.text
+        href = anchors[0].get_attribute('href')
+        # presumably the text between the first '(' and ')' of the item, used
+        # as publication time -- confirm against baseCore.getSubStr
+        pubTime = baseCore.getSubStr(text, '(', ')')
+        # Bound parameters replace hand-built SQL; the url is compared exactly
+        # as it is stored (no stray trailing space).
+        cursor.execute(
+            "select count(1) from usvsrussia where ftype=%s and stype=%s and ttype=%s and url=%s",
+            (ftype, stype, ttype, href))
+        if cursor.fetchone()[0] > 0:
+            log.info("已采集，跳过")
+            continue
+        cursor.execute(
+            "insert into  usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
+            "%s,%s,%s,%s,%s,%s,%s,0)",
+            ('美国财政部外国资产控制办公室', ftype, stype, ttype, href, text, pubTime))
+        cnx.commit()
+def job5():
+    """Collect the link lists of the OFAC "Non-English Translations of
+    Advisories and Other Documents" page into table usvsrussia (state=0).
+
+    Six lists on the page are visited; each is stored under its own ttype.
+    The browser window is closed even when collection fails part-way.
+    """
+    log.info("开始采集----第二层数据采集")
+    path = r'E:\chromedriver_win32\115\chromedriver.exe'
+    driverContent = baseCore.buildDriver(path, headless=False)
+    url='https://ofac.treasury.gov/sanctions-programs-and-country-information/non-english-translations-of-advisories-and-other-documents#ru_food_security'
+    ftype = "Russian Harmful Foreign Activities Sanctions"
+    stype = 'Non-English Translations of Advisories and Other Documents'
+    # (ttype, XPath of the list items) for every list collected from the page.
+    sections = [
+        ('TRANSLATIONS OF OFAC FOOD SECURITY FACT SHEET: RUSSIA SANCTIONS AND AGRICULTURAL TRADE',
+         '//*[@id="ru_food_security"]/ul/li'),
+        ('TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS FACT SHEET',
+         '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[1]/li'),
+        ('TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS ADVISORY',
+         '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[2]/li'),
+        ('TRANSLATIONS OF GLOBAL SHIPPING ADVISORY',
+         '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[3]/li'),
+        ('Translations of North Korean Shipping Advisories',
+         '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[4]/li'),
+        ('TRANSLATIONS OF NORTH KOREAN CYBER ADVISORY',
+         '//*[@id="node-27626"]/div/div/div[3]/div/div/div[8]/div/ul/li'),
+    ]
+    try:
+        driverContent.get(url)
+        for ttype, xpath in sections:
+            _collectListSection(driverContent, ftype, stype, ttype, xpath)
+    finally:
+        driverContent.close()
+def job6():
+    """Collect the table of the OFAC "Interpretative Rulings on OFAC Policy"
+    page into table usvsrussia (state=0, empty ttype).
+
+    For each table row the link text is stored as title, the link target as
+    url and the text of the row's ``<th>`` cell as pub_time. Rows already
+    stored for the same ftype/stype/url are skipped. The browser window is
+    closed even when collection fails part-way.
+    """
+    log.info("开始采集----第二层数据采集")
+    path = r'E:\chromedriver_win32\115\chromedriver.exe'
+    driverContent = baseCore.buildDriver(path, headless=False)
+    url='https://ofac.treasury.gov/sanctions-programs-and-country-information/iran-sanctions/interpretative-rulings-on-ofac-policy'
+    # NOTE(review): the page lives under iran-sanctions but rows are filed
+    # under the Russian ftype -- confirm this is intended.
+    ftype = "Russian Harmful Foreign Activities Sanctions"
+    stype = 'Interpretative Rulings on OFAC Policy'
+    try:
+        driverContent.get(url)
+        log.info(f"开始采集栏目---{stype}---")
+        rowEles = driverContent.find_elements(By.XPATH, '//*[@id="node-11996"]/div/div/div[3]/div/div/table/tbody/tr')
+        for rowEle in rowEles:
+            # find_elements returns [] instead of raising, so a row without a
+            # link or without a <th> cell is skipped rather than aborting the job.
+            anchors = rowEle.find_elements(By.TAG_NAME, 'a')
+            dateCells = rowEle.find_elements(By.TAG_NAME, 'th')
+            if not anchors or not dateCells:
+                continue
+            text = anchors[0].text
+            href = anchors[0].get_attribute('href')
+            pubTime = dateCells[0].text  # text of the row's <th> cell
+            # Bound parameters replace hand-built SQL; the url is compared
+            # exactly as it is stored (no stray trailing space).
+            cursor.execute(
+                "select count(1) from usvsrussia where ftype=%s and stype=%s and url=%s",
+                (ftype, stype, href))
+            if cursor.fetchone()[0] > 0:
+                log.info("已采集，跳过")
+                continue
+            cursor.execute(
+                "insert into  usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
+                "%s,%s,%s,'',%s,%s,%s,0)",
+                ('美国财政部外国资产控制办公室', ftype, stype, href, text, pubTime))
+            cnx.commit()
+    finally:
+        driverContent.close()
 if __name__ == '__main__':
    log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
-    job1()
+    #job1()
-    job2()
+    #job2()
-    job3()
+    #job3()
-    job4()
+    #job4()
+    # job1-job5 are commented out, so this run only executes job6.
+    #job5()
+    job6()
 baseCore.close()
 cursor.close()
 cnx.close()
\ No newline at end of file