提交 4e84d611 作者: 薛凌堃

10/26

上级 c2749092
...@@ -66,12 +66,15 @@ def NewsEnterprise(): ...@@ -66,12 +66,15 @@ def NewsEnterprise():
# gw_social_list = [item[0] for item in gw_result] # gw_social_list = [item[0] for item in gw_result]
#todo:打印长度 #todo:打印长度
# print(len(gw_social_list)) # print(len(gw_social_list))
gn_social_list = [item[0] for item in gn_result] # gn_social_list = [item[0] for item in gn_result]
aaa = '91320500703676365K,91210000242666665H,9111010810114488XN,91110000633715962Q,91442000618120215D,9112000013270080XR,91330200144069541X,911100006004827014,91310115703490552J,91441300791200462B,914405007254810917,91320400608117856C,91310000607311067X,913101156822157531,91320200673924654N,9144010173974661X9,912102002423997128,91530000719480244Y,913300001469343082,911100007715928418,911100001013053805,91330784771942047G,91330000704206605E,91310000631658829P,91370000163044841F,91330481673858589X,91330100143779306C,91310000566558740F,91310000729394470F,91320583753203830Q,91320214MA1MLB3M2A,91441900745512430D,9131000070322836XD,91110108565775188E,91330382MA28657U43,91440500192754762W,913700007306705753,913700007456765902,913700006135617321,913707007884641257,91320200250456967N,913710007823210514,91371000680666729L,913710002642503020,91371000166734784G,9137100061375530XH,91370000735783157F,91371000264190434B,91370600669347065T,91440300192287527J,91440101759431420M,91420100731042634N,91440600193813525E,91445300707813507B,91340700719911235R,91330000256018570F,91330300609381595H,91330326766445257X,91310000741167473L,91650000722367867J,91610000719782242D,916101327428232411,91360500698460390M,91340100610307130N,913205006082757232,91430100760727392G,912103001190699375,91340100740872226E,914406007583005174,91420000706811358X,91440300723009295R,913100007653010244,91310000759874061E,91310000734047094H,91130100679916292B,913200001347865204,91330000668325921R,91540195754285145H,913206007691214935,914201006634595767,914201003001005805,9142010066953862X0,91420100771373833D,91420100724667038L,9142010073106629X0,91420100682300843F,914200006164176058,914201003002476548,91420111783183308C,91420100568359390C,91420100799777098J,914200007146087391,9142010073104498XQ,91420000177730287E,914201007071163060,91420100300053761B,914201005550145025,91420100731084500Y,913402007408704905,91340200762794150A,913402005785489960,913200006082793884,91420100300251645N,913205001381896946,9132020071491965X
M,91320200724183068U,913202005502754040,91320200240505438W,91320200725201811T,91320200136349770P,91320200782736492H,91320211697939236T,91320282559266993J,91320200743938892R,91320200738287183E,913202007974023051,91320200060166715B,9132020072665605XK,91320200725202347B,91320200755862928J,91320205250220911F,91320200735716149R,91320200704046760T,91320200763551927E,913202000601816164,913202006816377193,913202110535042298,91320211790871547J,91320200775435667T,91320200732272706G,913202137186955428,91320200135890776N,91320200692568341T,91320211769137321E,91320205757981568F,91320200674440635K,913202006811293789,91320200741311914F,91320211250066467M,91440300770347406Y,914403007311028524,915101000833108553,916101315660088532,913502002600603688,9135020026013710XM,91350200260060034P,9135020061200896X6,913502002600846346,91350200612016388E,91350200751606855K,913502005750038518,91350200612033399C,913502007054071347,91350200612260049W,91350200155013367M,91350200260120674H,913502007054371227,91350200705466767W,91610103294241917P,91610100294239534G,9161010322061133XP,91610131628053714D,91610131628001738N,916101317974808482,91330500778299605T,91420000177583897Q,91420600714657151N,914105277474012089,913301006739591016,91440605053745575B,91330000143011639A,91440500231666168R,91350200784171077C,91650100228582422A,913300007195926252,91370600720751371J,91350505705245753B,91350000154341545Q,91330000609124409H,91330000142941287T,91110000710927126K,916300002265939457,916500007129676234,91650000722318862K,916502006978024838,916523007269460306,916529007846613320,91210100117812926M,91652800715548301W,91650000712966815D,91650100076066559G,916501004576329996,916501006827031595,91653100748663541B,916501007383763383,916542002035688270,91650100228584428B,91650200729156392B,91650000673412317X,91331081797615327C,91330300145498305G,914403007451834971,91411000780502633Q,91410724671699465T,914107006149375190,9141070068568407XM,91130400104365768G,9133038214550201X5,91420800764100001A,91330000148247018R,913
20300750041506E,911500007014638920,91370000184280878R,91341800779082563U,91411000772168241N,91330200734267003C,913203007974062428,91430900750606108C,913206007448277138,91610403338742407M,91350000158164371W,91420100616400352X,913210001410496687,913210036087333842,91321003725216976F,913210007908906337,9143010077900133XT,91370000166122374N,913700007337235643,91370600752684994W,91370600737203697G,91370000720717309H,913706006894842353,91370000613431903J,913706837381687230,9137000016503468XK,913706007063003983,91321000140719551F,91370300706025381Q,91441900618333632H,912201068239984307,91210000594843987K,915115002088509874,91330000725254155R,9143070067558223X2,913100007178563164,914405001959930485,91440000618095689N,91654002564379263N,91510100740327535Y,91320200704074497B,91640000227693286K,91330100749453087D,91441881755600266B,91210000716409709T,91421200562722881P,91510900754701583H,911525007116525588,91330000745826157T,91530000725287862K,91440606740846335Y,91350200612040492E,91350000727900106T,91330204746303411D,91330000722762533U,91350500583130113U,91430000760723375M,91310000748059571K,91110000600001760P,91350000611569108K,913310007530185122,91330000743452075L,911100007263731643,9132000072058020XK,91330200704800698F,914401017083874153,91340100610307675N,913204005603281353,91500112660889685L,912102002412697996,91330100720048715X,9161000006191747XU,913707007465823505,914306007225877126,914300007656254831,91330400551779794Q,915301007134092367,915301005551100783,91530000218920600L,91530000713464526C,91530000727317703K,915300007670584000,91530000709835283M,915300002194829991,915300007098268547,915300002179235351,91530000713401509F,915300007134134380,91410000170001401D,91610133132207011Q,9113040060115569X8,91330000723629902K,91340100327991758Q,91310000586778185R,91430800186881407B,91330183679858889H,91320582714943959W,913205007514219819,91320582790874377A,91320582729023768R,913205007185439367,913310211483889459,9133000074507862XQ,9133070314730958XR,9133000072527923XB,91330000
759522947D,91330600146342118G,9133000076251901XW,91331000554754592X,91330000609786138W,913310005753258189,91330481799606731M,91330000777214673R,91330503745844451F,91330000683124669E,91330723762512117A,91330000780477634Q,91330100757206158J,9133048172587440XX,913303007303249630,91330106563049270A,913300007245066803,91330000142944445H,91330000609671736A,913300007109591285,91430000796858896G,91330000742004828D,913300007308931541,91331000670275302R,91330100143956405Y,91331000255499827N,91330600609680368C,91330000704206103U,91330000704512063Y,9133000071257271X6,913310001481183122,91330683768696455K,913305007686509836,913306047498339794,913304216899850991,91330000609700859G,913300007315154541,9133000014871793XM,91330000609700795J,91331003148185689U,9133050066615054X7,91330109768216095R,91330481307761859N,91330000255072786B,91330000710969000C,913307837045812886,913301005865048038,91330000704277796X,91330800586274286A,91330400747729414J,913300001482332737,913300007664077600,91330000704676287N,913300007792873744,913301007517211839,913300001479306167,9133048114672516X7,91330000682900435M,9133070073033191X2,913300007368873961,913300001471207528,91330500763900410B,91330000609120272T,91330000741008835U,913300007463411432,91330100704290413D,913300001464759067,913306005547614262,913300007530435745,91330600753964306M,913300007829495191,913307007707246030,91330000723629566F,913306006096100281,913300007964528296,9133000025403311XB,91330000704503984N,91330000146684900A,913300006628918505,91330600146150140Y,91330000704676703E,91330000147115443M,91330000704204554C,91330000729092173R,913308025765293106,9133000074981708XL,9133000074100296XK,913300007047850454,91330600796466462H,913300001469757672,91330200704803223P,91330400X0942984X0,91330000704202137E,91330000142943469Q,9133060072360502XQ,91330500550515703T,91330211254100749G,91331023724526593X,91330000724502487G,91330000712560575G,91330000142927960N,913305007804719612,9133072670455890XG,91330681307478340A,9133000074200262XL,9133070378182
4255T,913310826683250245,91330000667124503L,915107007939595288,91330000726586776L,91330200698208670Y,91330000710987081Q,913304007352793803,91330000146183233T,91330500056855710M,913306837045254701,913300006096907427,913307007288998483,91330600145965997H,9133100070471153X3,91330000749843368W,913300001476445210,91330700147493495C,91330000779370442Q,91330102785327408E,91330281730145332E,913300007345233459,91330100665212665U,91330000704720655L,9133000077191496X7,91330400677231599U,913306006617396382,91330000607911599B,91330000720082446H,913310816100020466,91330000704207173A,9133110035546965XU,91330000717612987P,91330500704459485N,91330000704715960K,9133100075709503XC,91331000148144211K,91330523746336790G,91330100697072273U,91330300681666245N,913306247265987348,91330700566953812T,91330600779389434M,91330000755902563E,913304810852709304,913300007324065674,91330621755903566B,91330000754921594N,913300007043088475,913300002552164796,9133000074771866XW,91330326661705454E,91330000734522019A,91330000722765769J,91331000704716189P,91330000733811206X,913300007042034718,91330600704507918P,913310001480293875,91330621143010433H,913300001429192743,913306006628977937,913300001463546966,91330000MA2GAEH14D,91330000704721033H,91330683704477704R,91330301254496691M,91330600745085889D,91330000146150706G,9133000014616098X3,91330000146008822C,91330700776454800N,913303007793989040,913306216784286764,91330000704713738F,913308007639292214,91330700753962378R,91330000719525000X,913300006702752064,91330621721077606P,913311245623690963,913300007450544091,91330521147114918E,91330500753970802B,913300001429120051,91330000143906141R,91331082255225797Q,913300007200827022,9133000070459138X9,91330783552855277N,91330302585012778N,91331000782935301K,91330100751742531B,91330300145574611C,91330400559682372X,91330100735254191Q,91331000779358865H,91330600609661634M,91630000226882472D,91110000267130226N,91410100739082104K,91410100594879787D,91410100769490897P,91410100760248041Q,91321100718698874L,91321100761019494Q
,91430000712108626U,91370000164960593R,91370300787160568U,913500007356527552,91420100177682019R,91441700197332374T,913700007357889006,913207007849886428,91350000158156419U,913500007279127050,91341300711774766H,913706007763110099,91320205704071771A,91440300754257216E,91411300742548454G,91340100743098352K,913100007514799050,91350000158150236X,91370000206951100B,914420007251062242,91500105759295238A,91520000214466447K,91220000123925847D,913301101438971341,91210000243490294P,911101086835621402,914405007224920787,91310000558762442G,91120113727536666U,91420000707079234K,91440300192420826L,91440400775088415F,914404007211055669,91440400799386302M,9144040077096114X2,914404001925952982,9144040019252393X0,91440400782988681C,9144040075452568XT,91440400665003767C,91440400192520640G,914404006183915766,9144040072510822XR,914300007808508659,914302007656224696,91430200616610317F,91430200799104619D,91430200776779744R,91430200712106524U,913703007347051654,91130300104366111M,914419006698651618,914300007121944054,913305007302929303,91442000618132806P,91330100X09172319F,91440101712408557U,913205006082844193,914401011904604026,91610000MA6TL0ET50,91440000725063471N,91330000747735638J,91510000207312079C,9110000010002371XD,911000000000192465,91100000100017045K,91100000100023728D,91100000102063897J,911100000000184548,91110000100000489L,9111000010000093XR,91110000100000948R,91110000100001035K,91110000100001043E,91110000100005220B,911100001000055386,91110000100005888C,9111000010001002XD,91110000100010433L,91110000100010660R,911100001000128855,91110000100014493P,91110000100017707H,91110000100018267J,911100001000238160,91110000100024915R,911100001011200015,9111000010169286X1,91110000102016548J,911100001055722912,9111000071092446XA,91110000710924910P,91110000710924929L,911100007109250324,91110000710925243K,911100007109279199,911100007109284282,91110000710929498G,91110000710929930X,911100007109303176,911100007109310534,91110000710931061Y,9111000071093107XN,911100007109311097,9111000071093123XX,9111
0000710932515R,91110000710935732K,911100007178306183,91110000717830650E,91120000103063434T,911200005534349336,91130000677356885K,9113010010459478X9,91130500784050822M,91133100MA0GBL5F38,91140000070450154H,91140000MA0L7G6D21,91140000MA0LAJ3H0K,91210000558190456G,912201011239989159,91310000128515986K,91310000132200821H,91310000132206289R,9131000032469570XM,91310000710924478P,91310000749599465B,913100007956724321,91310000MA1FL1MMXL,91310000MA1FL70B67,91310000MA1H32292H,91310115084107728R,91320000608946953Q,913205097344220935,91320509796141166A,913300001430490399,91330000734530895W,9133000079338631XM,91330109143586141L,91330300751176226P,913306811462584935,91340700151105774A,91350200154990617T,91350200260147498N,91360000158264065X,913700002671781071,913700006722499338,913716261672060009,91410000663414132K,914200001000115161,914201001776819133,91440101231235448R,91440101231247350J,91440101347466547J,9144010172502048XD,91440300746645251H,914403007675664218,91450000198229061H,91460000708866504F,915101006863154368,91510100MA68KHX35G,91510100MAACK35Q85,915300002919962735,91610000220568570K,916100007625687785,91620300224690952T,916501005991597627,91110000717843275N,913400001489746613,911200007109339563,914200007581510645,91350500717357365T,91330000143995391Q,914401017083429628,912301001275921118,91110000710931141J,91150000114392559E,9111010155140268X8,91150100114111403U,91150100743882956A,91150000797181303E,91150000MA0NLRJ076,9115010011412575X7,91150000793609990G,911500000957889956,91150100701462670A,91150000783023945D,9115010079018195X0,911500001141229700,91150000X270330054,91310107132936177G,911500007013056834,91150000MA0N4HYF5X,91150000067504705P,91150000MA0PQMHU5T,91150000MA0QG7U246,913101071337312962,91310109630360895R,913100006318655619,91310000132295701K,91310105132207185C,91310110133226572E,91310109051268760Y,913101101332266447,91310101132307282H,91310115067758342E,91310115749279533D,91310101132220662G,91310000769684149F,913101061322082417,91310000132291639W,914602007
21278651T,91310118134373072F,913101098322500968,9131011313229600XR,91310109133065617J,91310113133405407T,91310000133139647N,91310101132504980J,913101171341092719,91310109631783195P,9131000073458050XB,91310120630941673A,91310104607218524U,91310000631319149J,91310000631757739E,91310000667805050M,913100005529432935,91310000132284295X,9131000013221297X9,913100003123156507,91310000MA1FL4Y718,913100007547623351,913100005601172662,91310000132201410X,91310000630245184Q,91310000132263849G,913100001322382488,91310101132214828F,913100006318635745,91310000132220312N,913100001322718147,9131000013221713XU,913100001322319278,91310000759006889A,913100001322128733,91310000132262168G,913100001322221746,91310000132228728T,913100006317558649,91310000MA1FL7ABX7,91310000MA7MGAGH5G,91310000MAC7AGQK2E,91310000MABXQWH71W,91310000132276535F,91310000MA1FL7QP6U,91310118MA1JPAB85C,91320000134795187R,913200001347595731,9132000078271658X0,91320000134767063W,91320000MA1P1ERM7T,91320000735724800G,9132000013478500X8,9132000072058717XP,91320000MA1YLUXD2W,91320000720587823R,913200001347771223,91320000134787937Q,913200001347507715,913200003238683144,91320000720585377G,9132000013475748XE,9132000077203354XH,91320000134850027D,913200003235715453,91320000MA2040GUXY,91320000MA20TXYX52,91330000142913112M,91330000758050706G,91330000671637379A,91330000798592788H,91330900307662068B,913300000683517554,913300007236299969,913300007276037692,91330000142917666G,91330000142911723D,913302005670431750,1233020075625385XG,9133020078676743XA,9133020014407480X5,91330200144055000M,913302007960219655,9133020025410298XE,9133020014409064XG,913302007900686311,91330200784303172P,91330000782926659M,91330000785683832W,91340000148943240Q,913406006775926850,91340300149861466W,913400000803136982,9134000072331410XA,913400007049015954,91340000MA2RYG6M22,91340000711778783B,9134000079644292X6,91340000148971532P,91340000705045276Y,91340000705044214B,913400001489739848,91340000148940701E,91340000MA2NKAX24Y,91340181153580560D,91340600150820
0390,91340400MA2RP38K42,913401001491402635,91340000670904113Y,9134000072632213X9,916100006847589897,91610700222542067H,91610000797924728K,91610301710086048M,91610000220527103Q,91610000MA6TG06P4D,91610000305718646D,91610000059672418N,9161000030577092X8,91610000052117366D,91610302713512620P,91610000220526151G,91610000MA6TG5H46J,91610000727343864B,9161000079078454X9,916100003054628635,91610000220535146F,91610000570663973E,91610000MA6TG43NXM,916100002205334589,91610000681587782E,91610000059670527U,91610000074509969A,91610000570668168K,916100005835106342,91310106780596246P,91610000220575738M,91610000220520630C,91610000056901668Y,91610113MAB0QTF62E,91610800MA70FUB2X9,91620300739622350L,916204005716275797,91620000720299995F,91620700670824180R,91620000MA72YXQ613,91620000MA748HK51R,9162000022433064XN,91620000438000013K,91620102719097851D,91620000712756631E,916200006654372581,91620000073568983Y,91620000MA73UCJ850,916200009245943712,91620000X24100305D,916200002248721900,916211227102756155,91620000745866570W,916200006654252818,916200007190339464,91620000719056611M,916200002244326626,916200002243386254,91620000719077033J,12620300438260237M,916202002246412029,91620105224526511D,9162010567083758X1,91620100224469959T,91620103224485561A,91620100296581077N,91620000MA72TBW70N,91620000224552058P,91620000MA74MFFBX6,91620000296584040E,91620000098238577P,91620000MA73J82M0G,91620000296623476P,91620000MA73WG5A4R,916328262275740092,91633300MA759FFA1W,91630000015000548J,916300007104085373,91630000226580757M,916300002265829634,916300007105860692,91633100564915871B,91630000710483537W,916300007105470788,91630000710404288D,91630000226586921N,91630000226591034H,9144030072619270XF,9163000071040638XJ,91640000694320542R,91640000MA75XH043G,91640000MA75XG8B9G,91640000710606357L,91640000MA75XJ7M0P,91640000227692945W,91640000MA75XJ33X8,91640000MACDDMB52H,91110108778635402E,9165042122906597XU,91650102228670318F,916501037817878108,91650000722328999R,91650100299945024F,91650100228603027Y,91650103228581884H,
916501022286956386,91650000710883848L,91650100697822433D,91650100228713891B,916500007223141241,91650100MA77GLNN5M,91650000228593105A,91650100228580718E,91650102228670238U,91650103228695179F,91650000MA776A778F,9165010222858513XG,916500002286736663,91650000091941411H,9165010059916844X8,91650100599166567D,91650100666655871D,91650100076066313E,9165010022872948XD,916500007318392722,91650100228595231A,91650000MA79HAN55K,91650000MA7JYR3K9T,916500002285806033,91650109795790391X,91410000706780942L,914100006987322024,914100007982385511,91410000747444427A,91410300171076114N,91410000415800253F,91410000712649924H,91410000693505019R,914100005817422124,91410000698736553A,91410000MA9LE1618G,91410000699963723F,91410000MA9LNHTF1C,9114000075725677X0,91140000701002121R,911400001100144545,91140000MA0HL5WN2L,911400007460236201,91140000110053488C,91140100MA0H5Y7R8Q,91140000MA0HLAAE2A,911400006686150485,91140000112360000T,91140000678191736U,91140000110112812M,91140000110014497J,91140000110014112R,91140000694272341C,91149900MA0KYJFJ38,91149900MA0L1MLR6W,91140700MA0LB2T300,91152900MA0NBNHB9X,911529000539164509,91150802701437045J,91150800MA0MWLMGXC,911508027901707321,91150800MA0MWAC131,91150802720185680P,911508027012747412,91150204747901974B,9115020072010852XH,911502005581423315,91150200MA0N121B5U,91150200318506394X,91150204736100047A,911502003289900529,911502027830054053,91150400676905272F,91150400MA0MY24T05,911504021148077345,91150400MA0N0N7J0Q,91150400MA0N1KDY95,91150600MA0MYRY61P,911501007013703625,91150100MA0MYCFT85,91150100701339816T,91150100MA0Q6LBR1F,91150100783008323W,9115010008518731XJ,91150121MA0Q1P3N1B,91150100341438852Q,91150700MA0N3U2C7N,911507005788814577,9115070075255329X1,911507007761138545,911507001151975447,911505007332895859,911503003414432561,911503007012609990,91150302701260876U,91150302114670221P,91150900MA0PXXPT6X,91150900MA0PXPJD5M,91152500397354738G,91152500790165976M,91152200341286924J,91310118677856310T,914100007251292747,91330100662324231U,9111000066990444XF,91330
000148868586D,91330000723628803R,91220000729540909F,91350200705487306K,91220000702425994U,91330901148716005T,91440300192255939T,91540000710905111C,91510000202285163Q,91451100711427393C,91140000267171001C,91420000722084584J,91340100754889192Q,91510100224367821D,9135010078216907X1,91610131556950212T,91320000100026961J,913300001460375783,914403007451740990,9133000071095874X3,914200007220290598,91440300665899831W,911400007460463205,91320211100013394P,91340000148975314D,914101007324826746,91220201124496079Q,9134020073498415XP,91441200725995439Y,914300007533850216,91310115729533231F,91370000163446410B,914103006148088992,91530000216521606P,9133000070471161XA,913710007060840744,91110000802062406U,91321200608812146K,91330200610257495J,91321000703903783L,91310000695826254X,91320506567813635P,91220101606092819L,91310000751873021H,91360106784146840K,91520000214433792D,91340100664238732X,91340500733034312N,915101845722876769,91510000725526042X,91320400747314251P,91370600746569906J,914303007483865809,91440000197576715Y,915100007422540773,91110000192472028J,91340121674200463H,91440101190484084A,91220101605902656F,91420112768092336G,911100007226014149,91330500720068476A,915203002147892034,914403001923528003,913501007438096369,914206001793145000,91130100236018805C,913200007455797746,911100006835529627,91440500723817938W,91230800127590757N,91340000704920454F,9137020071802356XK,91441900617994922G,911100007178710060,91510600205366604X,915200007366464537,913202007265601380,91330100762017394J,91330100742001328G,91330000758062811X,911101086723891430,913700007207576938,91430200712137961U,91110000710929148A,91310000134616599A,91320213061850324J,91450300708618439A,91330600739910598X,91110000795997288B,91110000192184333K,91441900618367138U,91330200610271537C,91310000631899761Q,914303007170467196,91445200077874291G,914400007250669553,91320600138299578A,91330000254847375U,91350200155052227K,91320200772038068L,91360500716575125F,91440300618888515F,91330000704277673W,91370000706385950B,9161010329
4241490X,913101156745626329,91310000133727203Q,91440400725466481C,911100002717519818,91110000100028633H,915001077500638601,915200006707225551,91440300618884987N,913202006079522354,913401006709173443,913702002646064362,913600007055083069,91340100348841353K,911200001030705897,91320000734417390D,913203001347934993,91320000249707722B,91310000132210544K,9135020076928783XA,9151060020515584XN,9132110075321015XF,913603007165007488,9132020072653508XD,91440300783905518J,913201921349556628,914102002681294387,91320300660802674Q,913302007342813661,91450800198227509B,9153000071947854XF,9111000010196866XH,9165000056438859XD,913300007743880298,9111000070024070XK,91320000743141824Y,913602007841010956,91500000202802570Y,91120000700492827M,91341100704965812G,913213001423289417,9132060072521804X6,91430500763263554A,91330000142943303A,91230200710935767F,91130300105390439K,911100008020705889,91310000607200164Q,913502007516215965,91220000123962584G,91310000745611834X,91420200714697006M,91110000100017336T,91370000168130028J,91310000133501183B,916100006611776206,913400001490341376,91442000617979677N,91410900744099904P,91430600186201870U,91610131710183542G,91210106243406830Q,91620000224371505Q,91110000710934537G,91350000158166297A,9151010075598305X1,913716007986665561,91320900743731816A,91150000720180740Y,913500001563372595,914503001991037270,9142010017767908XR,91330000704690900L,91320000134775688R,91441900673133772K,91410000706784652H,91320200578117344H,91610000745016111K,91110000710935257Q,915000002028133840,91610000220594875E,916200007202575254,91321200141076367Y,91340207664238230M,91110000700148065Y,914403007084294519,91360500674954556L,91370000267171810H,91510100201958223R,911100007109323200,914403001924223896,914403007917461234,91310000132212291W,91120000734546571Y,91370000706206553L,91410000733861107G,9131000013222900X9,9131000013221035XN,915100007758164357,913302007503813672,91610113628001682H,91440101100006899U,91341700MA2N8L8704,91130100700714215X,913100001322084958,914408006178035
32R,91620000712759170Q,ZZSN23011300000009,91320000751254554N,91420000271756344P,91330000146884443G,91310000425011944Y,91130000673224391T,916103007099018935,91530000709829203J,91620000224336881T,91320500720523600H,914406007192139717,9134000070503581XA,91310000132203723P,91510000206956235C,91330000720085639T,91430000183898967C,9132000074557990XP,913700007254238017,91440101190517616D,913708007657630504,91410100170033534A,911410006838069266,91430000185034687R,91230199723661865E,91330000143839073P,91321100608834062C,91320000703971102J,911100007226144851,91330000710959275N,91330000142917121G,914401017397031187,914100001700014285,91340100610300772G,91330000712550473W,91370200163621493Q,9142000070689187XB,914500001982250954,914502001982303373,913200007194121453,91500000202819532B,9131000013220921X6,913100006316131618,914401830545413557,91620000224529093P,91371300672231450Y,91440500192983581M,91370000163099420E,91140000110055862W,91110000717825966F,914403002793464898,91120103103368983M,91310000710933112E,913100005791928139,91210200604862592R,91610000727342693Q,914404001925268319,91610000713550723T,91440300192238549B,91440300192241158P,913100006311887755,91350200158163213A,91310116662478847M,91450000715182397J,91440400628053925E,91110000625909986U,91310112792759719A,912101002434901556,916500002296811666,91340000719986552R,914300001837784984,911400001123599660,91350000158160688P,91650000132278661Y,91520000714303759X,91430000183774980R,913300007046976861,91310000132202296L,913700001630684138,91320000670145129U,91510000206152800A,91120000712830811X,91510100725369155J,916500002286626765,91320000714091899R,91440700193957385W,912200007911418611,91330100719572130W,911100007109338846,91330100143200149A,914403002793630194,91230100127046743W,911311007216760190,91650000712958321C,91340000730032602U,913700002671842400,91430300755843372T,91370000729270531X,9144020019153918XA,91310000132210595U,915000007093295592,911300007468680177,91130400106900891W,915111002069551289,91360600759986995D,9
13700007409658444,915400002196726375,91320500746203699Q,91350100739548277W,91370000588768482Y,914405002311310326,91513200211352460H,913417006836379072,91420000179120511T,91211200201909093K,91430300722573708K,91450200715187622B,91440000710924128L,91360000705515290C,91440101761932988M,91110000710935329H,913100001322300861,91140000725909617E,91430700183811016L,9133000014553840X5,9131000070327821X9,911100001000127624,91610131726285914J,91440400721169041N,91330100742929345R,91100000710932021X,9113020070071264XQ,91310000132207732J,91370000163098284E,913209001401417456,911100007109351850,91330000732023371N,916500009287328820,91450000791346584E,91330000720084441G,91440101633208952W,91370000164960403T,91410000170011642P,911100007334480727,91320000741339087U,9142010072466171X0,9144050061755920X4,91310000132234925P,914403001922545226,913300001461463526,91340000731686376P,914100007126456409,91650000710892189L,915300002919937260,913207007382577341,914403006188988448,91140000701012581E,9137068166139756X1,91610300220533749U,91320582MA1NU2QE9N,91320100738866409D,913403007199576633,91320200720584462Q,91330701254999838P,914203005654858771,91320100631402444M,913100006072212052,91510600205363163Y,91130200721620963C,91110000740091307R,913300007441437848,91330200144565596J,916400007749178406,913404007109235209,'
gn_result = aaa.split(',')
gn_social_list = [item for item in gn_result]
print('=======') print('=======')
#将数据插入到redis中 #将数据插入到redis中
for item in gn_social_list: for item in gn_social_list:
r.rpush('NewsEnterprise:gnqy_socialCode', item) # r.rpush('NewsEnterprise:gnqy_socialCode', item)
# r.rpush('NewsEnterprise:gnqybc_socialCode', item) r.rpush('NewsEnterprise:gnqybc_socialCode', item)
# for item in gw_social_list: # for item in gw_social_list:
# r.rpush('NewsEnterprise:gwqy_socialCode', item) # r.rpush('NewsEnterprise:gwqy_socialCode', item)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys
import pandas as pd import pandas as pd
import requests import requests
from goose3 import Goose from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from base.smart.entity import * sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
from base.smart.smart_extractor_utility import SmartExtractorUtility from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用 # goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree from lxml import etree
from lxml.html import HtmlElement from lxml.html import HtmlElement
...@@ -135,6 +138,16 @@ class SmartExtractor: ...@@ -135,6 +138,16 @@ class SmartExtractor:
return self.get_extraction_result(article, link_text) return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
    """
    Extract article content from an HTML string that is already in memory.

    The markup is passed to ``self.goose.extract`` as ``raw_html``, and the
    resulting article is handed to ``self.get_extraction_result`` together
    with ``link_text``.

    :param html: raw HTML markup of the page to parse.
    :param link_text: text forwarded unchanged to
        ``get_extraction_result``; defaults to an empty string.
    :return: whatever ``get_extraction_result`` returns for the article.
    """
    # Parse the supplied markup and post-process it in a single expression.
    return self.get_extraction_result(
        self.goose.extract(raw_html=html),
        link_text,
    )
#url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....] #url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def extract_by_url_test(url_list,list_info_all): def extract_by_url_test(url_list,list_info_all):
# 测试:按URL采集 # 测试:按URL采集
......
# 根据信用代码获取天眼查id # 根据信用代码获取天眼查id
import json import json
import random import random
import sys
import time import time
import pymysql import pymysql
import requests import requests
from base.BaseCore import BaseCore sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5 requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
headers = { headers = {
'Accept': 'application/json, text/plain, */*', 'Accept': 'application/json, text/plain, */*',
......
...@@ -6,11 +6,12 @@ import requests, time, pymysql ...@@ -6,11 +6,12 @@ import requests, time, pymysql
import jieba import jieba
import sys import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer from kafka import KafkaProducer
from getTycId import getTycIdByXYDM from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore # from base.BaseCore import BaseCore
# from base.smart import smart_extractor # from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base') sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore import BaseCore
from smart import smart_extractor from smart import smart_extractor
import urllib3 import urllib3
...@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor ...@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+' taskType = '企业动态/天眼查/补采20W+'
def reqDetailmsg(url, headers, retries=1):
    """
    Download a detail page and return its decoded HTML text.

    Each attempt issues a GET request with an 8-second timeout and TLS
    certificate verification disabled, then decodes the body using the
    encoding detected from its content. Any exception raised during an
    attempt is logged and swallowed, so this function never raises for a
    failed request.

    :param url: address of the detail page to fetch.
    :param headers: HTTP request headers passed to ``requests.get``.
    :param retries: maximum number of attempts. The default of 1 keeps the
        original single-attempt behaviour; larger values retry until a
        non-empty body is obtained.
    :return: the page text, or ``''`` when no attempt produced a non-empty
        body (including when ``retries`` is zero or negative).
    """
    # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    # Bind the result up front so it is defined even if no attempt runs.
    htmltext = ''
    for _ in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=8, verify=False)
            # Decode with the encoding detected from the body itself.
            response.encoding = response.apparent_encoding
            htmltext = response.text
        except Exception as e:
            htmltext = ''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            # Stop at the first attempt that yields a non-empty page.
            break
    return htmltext
def beinWork(tyc_code, social_code,start_time): def beinWork(tyc_code, social_code,start_time):
time.sleep(3) time.sleep(3)
...@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
# 开始进行智能解析 # 开始进行智能解析
# lang = baseCore.detect_language(title) # lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang) # smart = smart_extractor.SmartExtractor(lang)
#带标签正文 # req = requests.get(url=link,headers=headers,timeout=10)
contentText = smart.extract_by_url(link).text # html = BeautifulSoup(req.content,'html.parser')
#不带标签正文 raw_html = reqDetailmsg(link,headers)
content = smart.extract_by_url(link).cleaned_text if raw_html:
# time.sleep(3)
# soup = BeautifulSoup(raw_html, 'html.parser')
try:
article = smart.extract_by_html(raw_html)
content = article.cleaned_text
contentText = article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
# #带标签正文
# contentText = smart.extract_by_url(link).text
# #不带标签正文
# content = smart.extract_by_url(link).cleaned_text
# # time.sleep(3)
except Exception as e: except Exception as e:
contentText = '' contentText = ''
if contentText == '': if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}') log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败' e = '获取正文失败'
...@@ -281,7 +312,7 @@ def doJob(): ...@@ -281,7 +312,7 @@ def doJob():
while True: while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode') social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
#social_code = '91440300665899831W' # social_code = '913205007764477744'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num): ...@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
'sid': '1684032033495392257', 'sid': '1684032033495392257',
'sourceAddress': year_url, # 原文链接 'sourceAddress': year_url, # 原文链接
'summary': '', 'summary': '',
'title': name_pdf.replace(',pdf', ''), 'title': name_pdf.replace('.pdf', ''),
'type': 1, 'type': 1,
'socialCreditCode': social_code, 'socialCreditCode': social_code,
'year': year 'year': year
...@@ -260,7 +260,7 @@ if __name__ == '__main__': ...@@ -260,7 +260,7 @@ if __name__ == '__main__':
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode') # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '913412007050444417' social_code = '91330000734507783B'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
continue continue
......
...@@ -33,13 +33,14 @@ def getRequest(url,headers): ...@@ -33,13 +33,14 @@ def getRequest(url,headers):
return json_data return json_data
# 严重失信 # 严重失信
def dishonesty(): def dishonesty(headers,com_name,social_code):
list_dishonesty = []
param = { param = {
'tableName':'credit_zgf_fr_sxbzxr', 'tableName':'credit_zgf_fr_sxbzxr',
'searchState': '1', 'searchState': '1',
'scenes': 'defaultscenario', 'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司', 'keyword': com_name,
'tyshxydm': '91440882315032592M', 'tyshxydm': social_code,
'page': '1', 'page': '1',
'pageSize': '10' 'pageSize': '10'
} }
...@@ -50,14 +51,14 @@ def dishonesty(): ...@@ -50,14 +51,14 @@ def dishonesty():
if json_data['status'] == 1: if json_data['status'] == 1:
pass pass
total_size = json_data['data']['totalSize'] total_size = json_data['data']['totalSize']
for page in total_size: for page in range(1,total_size+1):
param_page = { param_page = {
'tableName': 'credit_zgf_fr_sxbzxr', 'tableName': 'credit_zgf_fr_sxbzxr',
'searchState': '1', 'searchState': '1',
'scenes': 'defaultscenario', 'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司', 'keyword': com_name,
'tyshxydm': '91440882315032592M', 'tyshxydm': social_code,
'page': f'{page}', 'page': page,
'pageSize': '10' 'pageSize': '10'
} }
url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10' url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -67,7 +68,7 @@ def dishonesty(): ...@@ -67,7 +68,7 @@ def dishonesty():
pass pass
info_list = json_data['data']['list'] info_list = json_data['data']['list']
for info in info_list: for info in info_list:
entity = info['entity'] entity = info
iname = entity['iname'] # 失信被执行人姓名/名称 iname = entity['iname'] # 失信被执行人姓名/名称
cardnumber = entity['cardnumber'] # 组织机构代码 cardnumber = entity['cardnumber'] # 组织机构代码
court_name = entity['court_name'] # 执行法院 court_name = entity['court_name'] # 执行法院
...@@ -83,15 +84,34 @@ def dishonesty(): ...@@ -83,15 +84,34 @@ def dishonesty():
performed_part = entity['performed_part'] # 已履行部分 performed_part = entity['performed_part'] # 已履行部分
unperform_part = entity['unperform_part'] # 未履行部分 unperform_part = entity['unperform_part'] # 未履行部分
dataSource = info['dataSource'] # 数据来源 dataSource = info['dataSource'] # 数据来源
dic_dishonesty = {
'失信被执行人姓名/名称': iname,
'组织机构代码':cardnumber,
'执行法院':court_name,
'省份':area_name,
'执行依据文号':case_code,
'立案时间':reg_date,
'案号':gist_cid,
'做出执行依据单位':gist_unit,
'生效法律文书确定的义务':duty,
'被执行人的履行情况':performance,
'失信被执行人行为具体情形':disreput_type_name,
'发布时间':publish_date,
'已履行部分':performed_part,
'未履行部分':unperform_part,
'数据来源':dataSource
}
list_dishonesty.append(dic_dishonesty)
return list_dishonesty
# 行政处罚 # 行政处罚
def punish(): def punish(headers,com_name,social_code):
list_punish = []
param = { param = {
'tableName':'credit_xyzx_fr_xzcf_new', 'tableName':'credit_xyzx_fr_xzcf_new',
'searchState': '1', 'searchState': '1',
'scenes': 'defaultscenario', 'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司', 'keyword': com_name,
'tyshxydm': '91440882315032592M', 'tyshxydm': social_code,
'page': '1', 'page': '1',
'pageSize': '10' 'pageSize': '10'
} }
...@@ -106,15 +126,16 @@ def punish(): ...@@ -106,15 +126,16 @@ def punish():
if total_size > 0: if total_size > 0:
pass pass
else: else:
log.info() log.info(f'该企业{com_name}无行政处罚信息')
for page in total_size: return list_punish
for page in range(1,total_size+1):
param_page = { param_page = {
'tableName': 'credit_xyzx_fr_xzcf_new', 'tableName': 'credit_xyzx_fr_xzcf_new',
'searchState': '1', 'searchState': '1',
'scenes': 'defaultscenario', 'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司', 'keyword': com_name,
'tyshxydm': '91440882315032592M', 'tyshxydm': social_code,
'page': f'{page}', 'page': page,
'pageSize': '10' 'pageSize': '10'
} }
url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10' url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -141,6 +162,88 @@ def punish(): ...@@ -141,6 +162,88 @@ def punish():
cf_sjly = entity['cf_sjly'] # 数据来源 cf_sjly = entity['cf_sjly'] # 数据来源
cf_sjlydm = entity['cf_sjlydm'] # 数据来源单位统一社会信用代码 cf_sjlydm = entity['cf_sjlydm'] # 数据来源单位统一社会信用代码
dic_punish = {
'行政处罚决定书文号':cf_wsh,
'处罚类别':cf_cflb,
'处罚决定日期':cf_jdrq,
'处罚内容':cf_nr,
'罚款金额(万元)':cf_nr_fk,
'没收违法所得、没收非法财物的金额(万元)':cf_nr_wfff,
'暂扣或吊销证照名称及编号':cf_nr_zkdx,
'违法行为类型':cf_wfxw,
'违法事实':cf_sy,
'处罚依据':cf_yj,
'处罚机关':cf_cfjg,
'处罚机关统一社会信用代码':cf_cfjgdm,
'数据来源':cf_sjly,
'数据来源单位统一社会信用代码':cf_sjlydm
}
list_punish.append(dic_punish)
return list_punish
# Abnormal business operations list
def abnormal(headers,com_name,social_code):
    """Collect the abnormal-operation records published for one company.

    Queries the ``credit_scjdglzj_fr_ycjyml`` table of the Credit China
    catalog search API through ``getRequest`` and flattens every returned
    entry into a dict keyed by the Chinese field labels.

    Args:
        headers: HTTP headers forwarded to ``getRequest``.
        com_name: Company name used as the search keyword.
        social_code: Unified social credit code of the company.

    Returns:
        A list with one dict per record; empty when the API reports no
        records for the company.
    """
    list_abhormal = []
    search_url = (
        'https://public.creditchina.gov.cn/private-api/catalogSearch'
        '?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario'
        f'&keyword={com_name}&tyshxydm={social_code}'
    )
    json_data = getRequest(f'{search_url}&page=1&pageSize=10', headers)
    # Total number of records reported by the first query.
    total_size = json_data['data']['totalSize']
    if not total_size > 0:
        # Nothing published for this company: report it and stop early,
        # the same way punish() does.
        log.info(f'该企业{com_name}无经营异常信息')
        return list_abhormal
    # NOTE(review): totalSize looks like a record count rather than a page
    # count, so this probably requests more pages than exist. It is kept in
    # line with punish()/dishonesty(); confirm against the API before
    # tightening it to ceil(total_size / pageSize).
    for page in range(1, total_size + 1):
        json_data = getRequest(f'{search_url}&page={page}&pageSize=10', headers)
        for entity in json_data['data']['list']:
            list_abhormal.append({
                '企业名称': entity['entname'],
                '社会统一信用代码': entity['uniscid'],
                '法定代表人': entity['lerep'],
                '主体身份代码': entity['pripid'],
                '注册号': entity['regno'],
                '列入经营异常名录原因类型名称': entity['specausename'],
                '设立日期': entity['abntime'],
                '列入决定机关名称': entity['decorgname'],
                '数据来源': entity['dataSource'],
            })
    return list_abhormal
if __name__=='__main__': if __name__=='__main__':
...@@ -154,16 +257,18 @@ if __name__=='__main__': ...@@ -154,16 +257,18 @@ if __name__=='__main__':
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"' 'sec-ch-ua-platform': '"Windows"'
} }
type_list = ['严重失信主体名单','行政管理'] com_name = '石家庄交投集团工程服务有限责任公司'
com_name = '' social_code = '91130100MA7EK14C8L'
social_code = '' # list_dishonesty = dishonesty(headers,com_name,social_code)
dishonesty() # print(list_dishonesty)
punish() list_punish = punish(headers,com_name,social_code)
print(list_punish)
# abnormal(headers,com_name,social_code)
# 报告链接 # 报告链接
url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}' # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
report_json = getRequest(url_report, headers) # report_json = getRequest(url_report, headers)
reportNumber = report_json['data']['reportNumber'] # reportNumber = report_json['data']['reportNumber']
pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}' # pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
# respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30) # respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
......
...@@ -58,8 +58,8 @@ class Tycdt(object): ...@@ -58,8 +58,8 @@ class Tycdt(object):
def doJob(self): def doJob(self):
while True: while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode') social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
social_code = '913205002517479347' # social_code = '913205002517479347'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
......
...@@ -50,7 +50,7 @@ if __name__=="__main__": ...@@ -50,7 +50,7 @@ if __name__=="__main__":
opt.add_experimental_option("excludeSwitches", ["enable-automation"]) opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging']) opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False) opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe' opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe' chromedriver = r'D:\cmd100\chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "https://mp.weixin.qq.com/" url = "https://mp.weixin.qq.com/"
......
import datetime
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
    """Publish one news record to the ``crawlerInfo`` Kafka topic.

    The record is serialised as UTF-8 JSON and the call waits up to
    10 seconds for the broker acknowledgement.

    Args:
        dic_news: JSON-serialisable dict describing the article.

    Returns:
        True when the broker acknowledged the message, False on any error.
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
        kafka_result = producer.send("crawlerInfo",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
    finally:
        # A producer is created for every call; close it so its sockets and
        # background sender thread do not pile up over a long crawl.
        if producer is not None:
            try:
                producer.close()
            except Exception as close_error:
                # Closing is best effort and must not change the send result.
                log.info(f'KafkaProducer close failed: {close_error}')
def getRequest(url,headers):
    """Fetch ``url`` (30 second timeout) and return the body parsed by BeautifulSoup.

    The HTTP status code is not enforced: whatever body comes back is parsed
    with the ``html.parser`` backend.
    """
    response = requests.get(url=url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')
def deletep(soup,attribute_to_delete,value_to_delete):
    """Remove every <p> tag whose given attribute has the given value.

    The tree is modified in place; nothing is returned.
    """
    attr_filter = {attribute_to_delete: value_to_delete}
    for paragraph in soup.find_all('p', attr_filter):
        paragraph.decompose()
def deletek(soup):
    """Strip blank tags (e.g. <p></p>, <p><br></p>) from the tree in place.

    A tag is a candidate when its text is empty and it is not itself an
    img/video/br tag, or when its text is exactly one space. A candidate is
    kept if any of its descendants is an img, video or br tag.
    """
    media_names = ["img", "video", "br"]

    def _looks_blank(tag):
        has_no_text = len(tag.get_text()) == 0
        return (has_no_text and tag.name not in media_names) or tag.get_text() == ' '

    for candidate in soup.find_all(_looks_blank):
        wraps_media = any(child.name in media_names for child in candidate.descendants)
        if not wraps_media:
            candidate.decompose()
# Convert the relative addresses inside an HTML document to absolute ones
def paserUrl(html, listurl):
    """Resolve link targets in ``html`` against ``listurl``.

    ``html`` may be a markup string (parsed here with ``html.parser``) or an
    already parsed tree. For every <a> and <img> tag the ``href`` attribute
    is made absolute when present; otherwise the ``src`` attribute is. The
    (possibly newly parsed) tree is returned.
    """
    document = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html
    for node in document.find_all(['a', 'img']):
        # href wins over src; a tag carrying both only has its href rewritten.
        target = next((key for key in ('href', 'src') if key in node.attrs), None)
        if target is not None:
            node[target] = urljoin(listurl, node[target])
    return document
# Script entry point: walk the qstheory.cn index page, then each linked
# book/year page, each period (issue) page and finally each article, and
# push every article that passes the author filter to Kafka.
# NOTE(review): the nesting below is reconstructed from syntax because the
# original indentation is not visible - confirm against the real file.
if __name__=='__main__':
    # Browser-like request headers reused for every page fetch.
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Cookie':'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host':'www.qstheory.cn',
        'Proxy-Connection':'keep-alive',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url,headers)
    # Each 'col-sm-3' div on the index page holds one 'booktitle' link.
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        # The link text is stored as `year` but is not used again below.
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href,headers)
        period = soup_href.find('div', class_='highlight')
        # Drop centred paragraphs and blank tags so that only the
        # paragraphs carrying period links remain.
        deletep(period,'align','center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href,headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            # The first <p> directly under the 'highlight' div is skipped.
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                # Any failure while reading an entry's author, link or
                # title silently skips that entry.
                try:
                    deletek(new)
                    try:
                        # The author is read from the <font face='楷体'> tag;
                        # entries without one are skipped.
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    # Skip authors longer than four characters.
                    if len(author)>4:
                        continue
                    # Earlier, broader version of the filter below, kept for reference:
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #         or '中共' in author or '记者' in author or '新闻社' in author\
                    #         or '党委' in author or '调研组' in author or '研究中心' in author\
                    #         or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    # Skip authors containing any of these substrings.
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author\
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    # Dedupe: article URLs already sent are kept in a Redis
                    # set named after the period title.
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href,headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                # Pages without an 'inner' div are skipped.
                result = soup_new.find('div', class_='inner')
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                # presumably the first span is the source and the third the
                # publish time - verify against the page layout
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                # Plain text is taken before the links are rewritten; the
                # tagged content is taken after paserUrl makes them absolute.
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                # Record the URL in Redis only after Kafka accepted the
                # message, so failed sends are retried on a later run.
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论