提交 fd3e2672 作者: liuweigang

采集代码更新7

上级 019f8b31
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
...@@ -47,12 +47,12 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i ...@@ -47,12 +47,12 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
// System.out.println("——————++++++++++++——————==="); // System.out.println("——————++++++++++++——————===");
try { // try {
consumerPartition(); // consumerNoPartition();
} catch (Exception e) { // } catch (Exception e) {
consumerPartition(); // consumerNoPartition();
} // }
// loadSiteMsgLoc(); loadSiteMsgLoc();
} }
public void consumerPartition (){ public void consumerPartition (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
...@@ -85,6 +85,33 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i ...@@ -85,6 +85,33 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i
} }
} }
/**
 * Consumes keyword messages from the Kafka topic {@code Constants.KAFKA_CONSUMER_TOPIC}
 * using topic subscription (no manual partition assignment) and runs a Baidu
 * meta-search crawl for every message, synchronously on the calling thread.
 *
 * <p>The method loops forever; it only returns by throwing, which happens when
 * {@code poll} or {@code commitSync} fails. In that case the consumer is closed
 * before the exception propagates to the caller.
 *
 * <p>Offsets are committed right after each poll, i.e. before the records are
 * processed, so a record whose crawl fails is not redelivered.
 */
public void consumerNoPartition (){
    log.info("定时获取mq消息");
    // 1. Create the consumer.
    KafkaConsumer<String, String> consumer = createConsumer();
    try {
        consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
        // One Gson instance is enough; it is reusable across records.
        Gson gson = new Gson();
        while (true) {
            // Wait up to 300 ms for data (same timeout as loadSiteFitler) instead of
            // poll(0), which returns immediately and turns this loop into a busy spin.
            ConsumerRecords<String, String> records = consumer.poll(300);
            // Commits the offsets returned by the poll above. A second commit per
            // record would re-commit the very same offsets, so it is not repeated below.
            consumer.commitSync();
            for (ConsumerRecord<String, String> record : records) {
                try {
                    // Gson returns null for a null or empty payload; skip such records
                    // instead of handing a null message to the crawler.
                    KeywordMsg keywordMsg = gson.fromJson(record.value(), KeywordMsg.class);
                    if (keywordMsg == null) {
                        log.warn("关键词解析异常: " + record.value());
                        continue;
                    }
                    log.info("关键词解析keywordMsg正常");
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                    log.info("关键词请求结束++++");
                } catch (Exception e) {
                    // String concatenation is null-safe here, so a bad record can no
                    // longer raise a second exception inside the handler and kill the loop.
                    log.error("关键词解析异常: " + record.value(), e);
                }
            }
        }
    } finally {
        // Reached only when poll/commitSync throws; release the consumer's resources.
        consumer.close();
    }
}
public void loadSiteMsgLoc() { public void loadSiteMsgLoc() {
String filepath= Constants.META_SEARCH_KEYWORDPATH; String filepath= Constants.META_SEARCH_KEYWORDPATH;
System.out.println(filepath); System.out.println(filepath);
......
...@@ -134,14 +134,16 @@ public class BaiduSearchThread implements Runnable { ...@@ -134,14 +134,16 @@ public class BaiduSearchThread implements Runnable {
String charset = "utf-8"; String charset = "utf-8";
// Long orgId = Long.parseLong("2022082801"); // Long orgId = Long.parseLong("2022082801");
// Long orgId = Long.parseLong("2022090301"); //企业 // Long orgId = Long.parseLong("2022090301"); //企业
// Long orgId = Long.parseLong("2022090901"); //企业2
Long orgId = Long.parseLong("2022091301"); //企业3
// Long orgId = Long.parseLong("20220903022");//机器 // Long orgId = Long.parseLong("20220903022");//机器
Long orgId = Long.parseLong("20220905022");//智能 // Long orgId = Long.parseLong("20220905022");//智能
Long tid = Long.parseLong("20220905022"); Long tid = Long.parseLong("2022091301");
String proxyid=Constants.PROXY; String proxyid=Constants.PROXY;
if(proxyid.equals("1")) { if(proxyid.equals("1")) {
CatchWebNews(RecorderUtil.CatchWebOfBaiduByProxy(urlList, charset, orgId, tid),kWord); CatchWebNews(RecorderUtil.CatchWebOfBaiduByProxy(urlList, charset, orgId, tid),kWord);
}else { }else {
for (int i = 0; i <30; i++) { for (int i = 0; i <10; i++) {
String urla = url1.replace("[keyword]",kWord); String urla = url1.replace("[keyword]",kWord);
// urla = urla.replace("[startTime]",startTime); // urla = urla.replace("[startTime]",startTime);
// urla = urla.replace("[endTime]",endTime); // urla = urla.replace("[endTime]",endTime);
......
...@@ -208,7 +208,8 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -208,7 +208,8 @@ public class MetaBaiduSearchThread implements Runnable {
// 判断该网址是否存在于缓存池中 // 判断该网址是否存在于缓存池中
String orgId = String.valueOf(keywordMsg.getWordsCode()); String orgId = String.valueOf(keywordMsg.getWordsCode());
try { try {
boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress()); // boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress());
boolean sismember = JedisUtil.sismember("baidutest::"+orgId, cwbm.getSourceaddress());
if (sismember) { if (sismember) {
log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress()); log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress());
repeat++; repeat++;
...@@ -384,7 +385,8 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -384,7 +385,8 @@ public class MetaBaiduSearchThread implements Runnable {
// System.out.println(docInfo.getContentNoTag()); // System.out.println(docInfo.getContentNoTag());
} }
// 加入缓存池中 // 加入缓存池中
JedisUtil.sadd("baidu::"+orgId, cwbm.getSourceaddress()); // JedisUtil.sadd("baidu::"+orgId, cwbm.getSourceaddress());
JedisUtil.sadd("baidutest::"+orgId, cwbm.getSourceaddress());
// JedisUtil.setString(Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress(),"1",-1); // JedisUtil.setString(Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress(),"1",-1);
System.out.println("加入缓存池"); System.out.println("加入缓存池");
Thread.sleep(5000); Thread.sleep(5000);
...@@ -410,29 +412,18 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -410,29 +412,18 @@ public class MetaBaiduSearchThread implements Runnable {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
// 判断该网址是否存在于缓存池中 // 判断该网址是否存在于缓存池中
String orgId = String.valueOf(keywordMsg.getWordsCode()); String orgId = String.valueOf(keywordMsg.getId());
try { try {
boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress()); String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress());
if (sismember) { if (!org.apache.commons.lang3.StringUtils.isEmpty(urlflag)) {
log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress()); log.info(cwbm.getSourceaddress()+" 数据重复");
repeat++; repeat++;
continue; continue;
} }
} catch (Exception e) { }catch (Exception e){
log.info("redis获取信息失败"); log.info("redis获取信息失败");
} }
// try {
// String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress());
// if (!org.apache.commons.lang3.StringUtils.isEmpty(urlflag)) {
// log.info(cwbm.getSourceaddress()+" 数据重复");
// repeat++;
// continue;
// }
// }catch (Exception e){
// log.info("redis获取信息失败");
// }
String infourl = cwbm.getSourceaddress(); String infourl = cwbm.getSourceaddress();
String infodata = ""; String infodata = "";
String charset = ""; String charset = "";
......
...@@ -410,6 +410,12 @@ ...@@ -410,6 +410,12 @@
<groupId>org.assertj</groupId> <groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId> <artifactId>assertj-core</artifactId>
</dependency> </dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-codec</artifactId>
<version>4.1.34.Final</version>
<scope>compile</scope>
</dependency>
</dependencies> </dependencies>
......
...@@ -10,6 +10,8 @@ import com.zzsn.generation.Constants; ...@@ -10,6 +10,8 @@ import com.zzsn.generation.Constants;
import com.zzsn.generation.FileUtil; import com.zzsn.generation.FileUtil;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.job.KafkaConsumerJob; import com.zzsn.job.KafkaConsumerJob;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.clients.consumer.KafkaConsumer;
...@@ -25,8 +27,10 @@ import org.springframework.context.ConfigurableApplicationContext; ...@@ -25,8 +27,10 @@ import org.springframework.context.ConfigurableApplicationContext;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
@Slf4j
@SpringBootApplication(scanBasePackages="com.zzsn") @SpringBootApplication(scanBasePackages="com.zzsn")
//@ServletComponentScan //@ServletComponentScan
//@MapperScan(basePackages = "com.zzsn.dao") //@MapperScan(basePackages = "com.zzsn.dao")
...@@ -46,12 +50,16 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -46,12 +50,16 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
@Override @Override
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
try { // try {
loadSiteMsg(); // loadSiteMsg();
} catch (Exception e) { // } catch (Exception e) {
loadSiteMsg(); // loadSiteMsg();
} // }
// loadSiteMsgLoc(); // try {
// loadSiteFitler();
// } catch (Exception e) {
// loadSiteFitler();
// }
// loadSiteMsgLoc2(); // loadSiteMsgLoc2();
// loadSiteMsgLoc3(); // loadSiteMsgLoc3();
} }
...@@ -98,6 +106,53 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -98,6 +106,53 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// loadSiteMsg(); // loadSiteMsg();
} }
} }
/**
 * Consumes site-template messages from the Kafka topic
 * {@code Constants.KAFKA_CONSUMER_TOPIC} and crawls only those whose
 * {@code infoSourceCode} appears as a line in the file at {@code Constants.IMGPATH}.
 *
 * <p>The polling loop runs forever. A failure while polling/committing, or while
 * handling a single record, is logged and the loop carries on. A failure during
 * setup (reading the file, creating the consumer, subscribing) is printed, the
 * method sleeps 30 seconds and then returns.
 *
 * <p>Offsets are committed right after each poll, i.e. before the records are
 * processed, so a record that fails is not redelivered.
 */
public void loadSiteFitler(){
    KafkaConsumer<String, String> consumer = null;
    try{
        String filepath= Constants.IMGPATH;
        System.out.println(filepath);
        File f = new File(filepath);
        // Each line of the file is one info-source code that may be crawled.
        List<String> allLines = FileUtil.getFileLines(f, "utf-8");
        KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
        System.out.println("进入定时获取mq消息");
        // 1. Create the consumer.
        consumer = kafkaConsumerJob.createConsumer();
        consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
        Gson gson = new Gson();
        while(true){
            ConsumerRecords<String, String> records;
            try {
                // Wait up to 300 ms for the broker to return data; poll returns after
                // the timeout whether or not records are available.
                // NOTE(review): every exception is caught here, so a wakeup() issued from
                // another thread presumably does not end this loop — confirm intent.
                records = consumer.poll(300);
                consumer.commitSync();
            } catch (Exception e) {
                log.error("kafka拉取消息异常", e);
                continue;
            }
            if (records == null || records.count() == 0) {
                continue;
            }
            for (ConsumerRecord<String, String> record : records) {
                // Guard each record separately: the offsets are already committed, so
                // letting one bad message abort the batch would lose the rest of it.
                try {
                    System.out.println("kafka消息:" + record.value());
                    SiteMsgTemple siteMsgTemple = gson.fromJson(record.value(), SiteMsgTemple.class);
                    if (siteMsgTemple == null) {
                        // Gson returns null for a null or empty payload.
                        continue;
                    }
                    String infoSourceCode = siteMsgTemple.getInfoSourceCode();
                    if(StringUtils.isNotEmpty(infoSourceCode) && allLines.contains(infoSourceCode)){
                        DynaminSiteThread siteThread = new DynaminSiteThread();
                        siteThread.siteMsgTemple = siteMsgTemple;
                        siteThread.crawler();
                    }
                } catch (Exception e) {
                    log.error("kafka消息处理异常: " + record.value(), e);
                }
            }
        }
    }catch (Exception e){
        System.out.println(e.getMessage());
        System.out.println("程序异常+++++");
        log.error("程序异常+++++", e);
        try {
            Thread.sleep(30000);
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so the caller can observe the interruption.
            Thread.currentThread().interrupt();
        }
    } finally {
        if (consumer != null) {
            try {
                consumer.close();
            } catch (Exception e) {
                log.warn("kafka consumer关闭异常", e);
            }
        }
    }
}
public void loadSiteMsgLoc() { public void loadSiteMsgLoc() {
String filepath= Constants.IMGPATH; String filepath= Constants.IMGPATH;
System.out.println(filepath); System.out.println(filepath);
...@@ -112,8 +167,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -112,8 +167,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
siteMsgTemple.setYnDynamicCrawl(1); // siteMsgTemple.setYnDynamicCrawl(1);
siteMsgTemple.setYnSnapshot("1"); // siteMsgTemple.setYnSnapshot("1");
siteThread.siteMsgTemple = siteMsgTemple; siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler(); siteThread.crawler();
}catch (Exception e){ }catch (Exception e){
...@@ -133,30 +188,30 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -133,30 +188,30 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1560150270181019650\",\n" + " \"id\": \"1534682499419820034\",\n" +
" \"infoSourceCode\": \"IN-20220818-0011\",\n" + " \"infoSourceCode\": \"IN-20220609-47193\",\n" +
" \"webSiteName\": \"一带一路-项目周报\",\n" + " \"webSiteName\": \"雪佛龙\",\n" +
" \"siteName\": \"一带一路-项目周报\",\n" + " \"siteName\": \"雪佛龙-新闻发布\",\n" +
" \"siteUri\": \"https://www.yidaiyilu.gov.cn/info/iList.jsp?cat_id=11432\",\n" + " \"siteUri\": \"https://www.chevron.com/investors/press-releases\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": \"2\",\n" +
" \"language\": null,\n" + " \"language\": \"en\",\n" +
" \"checkedList\": null,\n" + " \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" + " \"hisUriExp\": \"https://www\\\\.chevron\\\\.com/investors/press-releases\\r\\nhttps://www.chevron.com/investors/press-releases\",\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": \"0\",\n" +
" \"listUrl\": null,\n" + " \"listUrl\": \"https://www.chevron.com/investors/press-releases\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"2\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": \"\",\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": \"\",\n" +
" \"informationPublishDate\": \"span\",\n" + " \"informationPublishDate\": \"\",\n" +
" \"informationSource\": null,\n" + " \"informationSource\": \"\",\n" +
" \"infoBlockPosition\": \"ul[class=\\\"commonList_dot\\\"]>li\",\n" + " \"infoBlockPosition\": \"//table[class=\\\"nirtable news-table\\\"]//tr\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \".//a/@href\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":6,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": 3,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
...@@ -164,19 +219,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -164,19 +219,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"main_content_title\\\"]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>span[class=\\\"ccbnTtl\\\"]>div[class=\\\"field__item\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"szty\\\"]>span:contains(时间)</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>meta[name=\\\"pubdate\\\"]</exp><attr>content</attr></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>div[class=\\\"szty\\\"]>span:contains(来源)</exp></origin>\",\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>*.div[class=\\\"node__content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":6,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":6,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -185,7 +240,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -185,7 +240,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" + " \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 1,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
...@@ -201,8 +256,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -201,8 +256,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"05 23 14 1/7 * ?\",\n" + " \"cron\": \"17 2 0/10 * * ?\",\n" +
" \"ynSnapshot\": \"0\"\n" + " \"ynSnapshot\": null\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
......
...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
// siteMsgTemple.setVerifyType("1"); siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
......
...@@ -2,10 +2,7 @@ package com.zzsn.api; ...@@ -2,10 +2,7 @@ package com.zzsn.api;
import cn.hutool.core.date.DateTime; import cn.hutool.core.date.DateTime;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.paser.WebContentPaserByCss; import com.zzsn.crawler.paser.*;
import com.zzsn.crawler.paser.WebContentPaserByJsonXpath;
import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -43,11 +40,11 @@ public class SiteInfoVerify{ ...@@ -43,11 +40,11 @@ public class SiteInfoVerify{
//判断解析表达式类型 //判断解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSSVerify(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapthVerify(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析 }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath(); WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
...@@ -55,7 +52,10 @@ public class SiteInfoVerify{ ...@@ -55,7 +52,10 @@ public class SiteInfoVerify{
}else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析 }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegularVerify(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("4")){//智能解析
WebContentPaserByIntellige webContentPaserByIntellige=new WebContentPaserByIntellige();
metaSearchList = webContentPaserByIntellige.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
} }
VerifyResult verifyResult=new VerifyResult(); VerifyResult verifyResult=new VerifyResult();
if(metaSearchList.size()>0) { if(metaSearchList.size()>0) {
...@@ -95,12 +95,11 @@ public class SiteInfoVerify{ ...@@ -95,12 +95,11 @@ public class SiteInfoVerify{
//判断解析表达式类型 //判断解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSSVerify(urlList, charset, siteMsgTemple);
//获取资讯详情信息 根据标签解析
}else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapthVerify(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析 }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath(); WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
...@@ -108,7 +107,10 @@ public class SiteInfoVerify{ ...@@ -108,7 +107,10 @@ public class SiteInfoVerify{
}else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析 }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegularVerify(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("4")){//智能解析
WebContentPaserByIntellige webContentPaserByIntellige=new WebContentPaserByIntellige();
metaSearchList = webContentPaserByIntellige.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
} }
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
...@@ -117,23 +119,23 @@ public class SiteInfoVerify{ ...@@ -117,23 +119,23 @@ public class SiteInfoVerify{
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByCss.catchVerifyWebNewsByCSS(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByCss.catchVerifyWebNewsByCSS(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
}else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByXpath.catchVerifyWebNewsByXpath(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByXpath.catchVerifyWebNewsByXpath(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
}else if(siteMsgTemple.getDetailExpressionType().equals("1")){//jsonpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("1")){//jsonpath解析
WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath(); WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByJsonXpath.catchVerifyWebNewsByJsonPath(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByJsonXpath.catchVerifyWebNewsByJsonPath(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
}else if(siteMsgTemple.getDetailExpressionType().equals("0")){//正则解析 }else if(siteMsgTemple.getDetailExpressionType().equals("0")){//正则解析
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
docInfoList = webContentPaserByRegular.catchVerifyWebNewsByRegular(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByRegular.catchVerifyWebNewsByRegular(metaSearchList, siteMsgTemple);
}else if(siteMsgTemple.getDetailExpressionType().equals("4")){//智能解析
WebContentPaserByIntellige webContentPaserByIntellige=new WebContentPaserByIntellige();
docInfoList = webContentPaserByIntellige.catchVerifyWebNewsByIntellige(metaSearchList, siteMsgTemple);
}
log.info("本次获取详情: "+docInfoList.size()+"个"); log.info("本次获取详情: "+docInfoList.size()+"个");
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+ DateTime.now()+"采集条数:"+docInfoList.size()); log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+ DateTime.now()+"采集条数:"+docInfoList.size());
}
VerifyResult verifyResult=new VerifyResult(); VerifyResult verifyResult=new VerifyResult();
if(docInfoList.size()>0) { if(docInfoList.size()>0) {
......
...@@ -4,10 +4,7 @@ import cn.hutool.core.date.DateTime; ...@@ -4,10 +4,7 @@ import cn.hutool.core.date.DateTime;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.paser.WebContentPaserByCss; import com.zzsn.crawler.paser.*;
import com.zzsn.crawler.paser.WebContentPaserByJsonXpath;
import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.HisURIConfig; import com.zzsn.crawler.uriparser.HisURIConfig;
import com.zzsn.crawler.uriparser.HisURIParser; import com.zzsn.crawler.uriparser.HisURIParser;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -94,8 +91,12 @@ public class DynaminSiteThread implements Runnable{ ...@@ -94,8 +91,12 @@ public class DynaminSiteThread implements Runnable{
}else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析 }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("4")){//智能解析IntelligeParse
WebContentPaserByIntellige webContentPaserByIntellige=new WebContentPaserByIntellige();
metaSearchList = webContentPaserByIntellige.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
} }
log.info(siteMsgTemple.getSiteName()+"本次获取列表: "+metaSearchList.size()+"个");
//资讯类容抽取 //资讯类容抽取
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//判断解析详情表达式类型 //判断解析详情表达式类型
...@@ -103,27 +104,25 @@ public class DynaminSiteThread implements Runnable{ ...@@ -103,27 +104,25 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+DateTime.now()+"采集条数:"+docInfoList.size());
}else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByXpath.CatchWebNewsByXpath(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByXpath.CatchWebNewsByXpath(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+DateTime.now()+"采集条数:"+docInfoList.size());
}else if(siteMsgTemple.getDetailExpressionType().equals("1")){//jsonpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("1")){//jsonpath解析
WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath(); WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByJsonXpath.catchWebNewsByJsonPath(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByJsonXpath.catchWebNewsByJsonPath(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个");
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+DateTime.now()+"采集条数:"+docInfoList.size());
}else if(siteMsgTemple.getDetailExpressionType().equals("0")){//正则解析 }else if(siteMsgTemple.getDetailExpressionType().equals("0")){//正则解析
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
docInfoList = webContentPaserByRegular.catchWebNewsByRegular(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByRegular.catchWebNewsByRegular(metaSearchList, siteMsgTemple);
log.info("本次获取详情: "+docInfoList.size()+"个"); }else if(siteMsgTemple.getDetailExpressionType().equals("4")){//智能解析IntelligeParse
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+DateTime.now()+"采集条数:"+docInfoList.size()); WebContentPaserByIntellige webContentPaserByIntellige=new WebContentPaserByIntellige();
docInfoList = webContentPaserByIntellige.catchWebNewsByCSS(metaSearchList, siteMsgTemple);
} }
log.info(siteMsgTemple.getSiteName()+"本次获取详情: "+docInfoList.size()+"个");
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集结束时间:"+DateTime.now()+"采集条数:"+docInfoList.size());
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try { try {
......
...@@ -439,7 +439,7 @@ public class PaserSiteDownload { ...@@ -439,7 +439,7 @@ public class PaserSiteDownload {
try { try {
infodata = EntityUtils.toString(entitydata, charset); infodata = EntityUtils.toString(entitydata, charset);
} catch (Exception e1) { } catch (Exception e1) {
e1.printStackTrace(); return charset;
} }
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
...@@ -526,8 +526,9 @@ public class PaserSiteDownload { ...@@ -526,8 +526,9 @@ public class PaserSiteDownload {
// 获取所要抓取网页的编码方式 // 获取所要抓取网页的编码方式
public static String locateCharSet(String url) { public static String locateCharSet(String url) {
String encoding = "utf-8"; String encoding = "utf-8";
Connection conn=null;
try { try {
Connection conn = Jsoup.connect(url); conn = Jsoup.connect(url);
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"); conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
// 伪装成浏览器 // 伪装成浏览器
Document doc = conn.ignoreContentType(true).timeout(5000).get(); Document doc = conn.ignoreContentType(true).timeout(5000).get();
......
package com.zzsn.crawler.paser; package com.zzsn.crawler.paser;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -9,21 +10,24 @@ import com.zzsn.util.*; ...@@ -9,21 +10,24 @@ import com.zzsn.util.*;
import lombok.Data; import lombok.Data;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.*;
import org.apache.http.Header; import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope; import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient; import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.params.ConnRouteParams; import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy; import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
...@@ -410,4 +414,54 @@ public class PaserCommDownload { ...@@ -410,4 +414,54 @@ public class PaserCommDownload {
return encoding; return encoding;
} }
/**
 * Sends {@code params} serialized as a JSON body in an HTTP POST to {@code url}
 * and returns the response body decoded as UTF-8.
 *
 * <p>Connect, connection-request and socket timeouts are all set to 200000 ms.
 * A fixed {@code Cookie} header is attached to every request.
 * NOTE(review): the cookie value in this source is the placeholder "xxxxxx";
 * presumably a real value is supplied before deployment - confirm.
 *
 * @param url    target URL of the POST request
 * @param params values serialized with {@code JSONObject.toJSONString} into the request body
 * @return the response body when the status is 200 and an entity is present;
 *         an empty string when the status is not 200;
 *         {@code null} when the status is 200 but the response carries no entity
 * @throws Exception if the request cannot be executed or the response cannot be read
 */
public static String postData(String url, Map<String, Object> params) throws Exception {
    HttpPost httppost = new HttpPost(url);

    // Request-level timeouts (milliseconds).
    RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(200000).setConnectionRequestTimeout(200000).setSocketTimeout(200000)
            .build();
    httppost.setConfig(requestConfig);

    httppost.setHeader("Cookie","xxxxxx");
    httppost.setEntity(new StringEntity(JSONObject.toJSONString(params), ContentType.create("application/json", "utf-8")));

    // try-with-resources closes the response first and then the client, so the
    // underlying connection is released on every path (the response used to leak).
    try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(httppost)) {
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            System.out.println("请求失败!");
            return "";
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        return EntityUtils.toString(entity, "UTF-8");
    }
}
} }
...@@ -29,7 +29,7 @@ public class SeleniumTime { ...@@ -29,7 +29,7 @@ public class SeleniumTime {
public static String getVerifyScopehtml(String url) { public static String getVerifyScopehtml(String url) {
String html = ""; String html = "";
ChromeOptions chromeOptions = new ChromeOptions(); ChromeOptions chromeOptions = new ChromeOptions();
ChromeDriver driver; ChromeDriver driver=null;
ChromeDriverService service = new ChromeDriverService.Builder(). ChromeDriverService service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build(); usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try { try {
...@@ -57,6 +57,10 @@ public class SeleniumTime { ...@@ -57,6 +57,10 @@ public class SeleniumTime {
} }
} catch (Exception e) { } catch (Exception e) {
log.info("驱动访问页面出现出现异常:" + e.getMessage()); log.info("驱动访问页面出现出现异常:" + e.getMessage());
}finally{
if(driver!=null) {
driver.quit();
}
} }
return html; return html;
} }
...@@ -72,10 +76,10 @@ public class SeleniumTime { ...@@ -72,10 +76,10 @@ public class SeleniumTime {
ReuseWebDriver driver = DriverUtil.getChromeDriver(); ReuseWebDriver driver = DriverUtil.getChromeDriver();
try { try {
Duration duration=Duration.of(100, ChronoUnit.SECONDS); Duration duration=Duration.of(50, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
Thread.sleep(1000); // Thread.sleep(1000);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
......
...@@ -5,6 +5,8 @@ import java.io.File; ...@@ -5,6 +5,8 @@ import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.PrintStream; import java.io.PrintStream;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import org.openqa.selenium.By; import org.openqa.selenium.By;
...@@ -49,12 +51,14 @@ public class SeleniumTime4 { ...@@ -49,12 +51,14 @@ public class SeleniumTime4 {
// ChromeDriver driver = new ChromeDriver(chromeOptions); // ChromeDriver driver = new ChromeDriver(chromeOptions);
//===================================================================================================== //=====================================================================================================
try{ try{
Duration duration=Duration.of(100, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
try{ try{
Thread.sleep(3000l); Thread.sleep(2000l);
String html = webElement.getAttribute("outerHTML"); String html = webElement.getAttribute("outerHTML");
Thread.sleep(5000l); Thread.sleep(3000l);
driver.quit(); driver.quit();
// System.out.println(html); // System.out.println(html);
if(url.contains("http://www.flw.ph")){ if(url.contains("http://www.flw.ph")){
......
package com.zzsn.crawler.uriparser;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zzsn.generation.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
/**
 * One-shot helper that opens a URL in a Chrome instance driven by Selenium and
 * returns the rendered HTML.
 *
 * <p>The driver is created in the constructor and is always quit by
 * {@link #getScopehtml(String)}, so an instance can serve exactly one call.
 */
public class SeleniumVerify {
    /** Options passed to the Chrome driver; the constructor disables image loading. */
    public ChromeOptions chromeOptions =new ChromeOptions() ;
    /** Driver created by the constructor and quit at the end of {@link #getScopehtml(String)}. */
    public ChromeDriver driver;

    /** Pages under this prefix get the attachment notice cut out of their HTML. */
    private static final String FLW_HOST = "http://www.flw.ph";
    /** Opening marker of the attachment notice on flw.ph pages. */
    private static final String FLW_TIPS_OPEN = "<div class=\"attach_nopermission attach_tips\">";
    /** Close-button marker inside the attachment notice on flw.ph pages. */
    private static final String FLW_TIPS_CLOSE = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
    /**
     * Number of characters dropped right after the close-button marker.
     * NOTE(review): presumably the markup that closes the notice block - confirm against a live page.
     */
    private static final int FLW_TRAILING_SKIP = 7;

    /**
     * Points Selenium at the configured chromedriver binary and starts Chrome
     * with image loading disabled.
     */
    public SeleniumVerify(){
        System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
        chromeOptions.addArguments("blink-settings=imagesEnabled=false");
        driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Loads {@code url} and returns the page HTML.
     *
     * <p>The HTML is read from the {@code outerHTML} attribute of the root element;
     * if that fails with a runtime exception (for example a stale element
     * reference) the driver's page source is used instead. For flw.ph pages the
     * attachment notice is removed from the result. The driver is quit exactly
     * once, whatever the outcome.
     *
     * @param url address of the page to load
     * @return the page HTML, or {@code null} if the page could not be loaded or
     *         the calling thread was interrupted
     */
    public String getScopehtml(String url){
        try{
            driver.get(url);
            WebElement webElement = driver.findElement(By.xpath("/html"));
            String html;
            try{
                // Give page scripts time to finish before reading the DOM.
                Thread.sleep(3000L);
                html = webElement.getAttribute("outerHTML");
            }catch(RuntimeException e){
                System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
                        +"可能原因为过快的执行没有找到指定的页面元素");
                System.out.println("=============执行方法二==============");
                Thread.sleep(3000L);
                html = driver.getPageSource();
            }
            Thread.sleep(5000L);
            return stripFlwAttachTips(url, html);
        }catch(InterruptedException e){
            // Keep the interrupt visible to the caller's thread instead of swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return null;
        }catch(Exception e){
            e.printStackTrace();
            return null;
        }finally{
            // Single shutdown point: previously the driver could be quit several
            // times, and the fallback path ran against an already-quit driver.
            driver.quit();
        }
    }

    /**
     * Removes the attachment notice from the HTML of a flw.ph page.
     * Returns {@code html} unchanged when the URL is not a flw.ph URL, when
     * either marker is missing, or when the text around the markers is too
     * short to cut safely.
     */
    private static String stripFlwAttachTips(String url, String html){
        if(html == null || !url.contains(FLW_HOST)){
            return html;
        }
        if(!html.contains(FLW_TIPS_OPEN) || !html.contains(FLW_TIPS_CLOSE)){
            return html;
        }
        String[] aroundOpen = html.split(FLW_TIPS_OPEN);
        if(aroundOpen.length < 2){
            return html;
        }
        String[] aroundClose = aroundOpen[1].split(FLW_TIPS_CLOSE);
        if(aroundClose.length < 2 || aroundClose[1].length() < FLW_TRAILING_SKIP){
            return html;
        }
        return aroundOpen[0] + aroundClose[1].substring(FLW_TRAILING_SKIP);
    }

    /**
     * Manual check: downloads one flw.ph thread, cuts out the attachment notice
     * by string splitting and writes the result to {@code D:/123.txt}.
     */
    public static void main(String[] args) {
        // NOTE(review): this harness drives SeleniumTime rather than SeleniumVerify;
        // looks like a leftover from copying the class - confirm which one is intended.
        SeleniumTime s = new SeleniumTime();
        String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
        if(scopehtml == null){
            System.out.println("no html returned, nothing to do");
            return;
        }
        String a = FLW_TIPS_OPEN;
        String b = FLW_TIPS_CLOSE;
        System.out.println("开始");
        if(scopehtml.contains(a)){
            System.out.println("包含a");
        }
        // The second check used to test marker a again.
        if(scopehtml.contains(b)){
            System.out.println("包含b");
        }
        System.out.println("结束");
        String[] split = scopehtml.split(a);
        System.out.println("首次截取的长度"+split.length);
        if(split.length < 2){
            System.out.println("opening marker not found, nothing written");
            return;
        }
        String[] split2 = split[1].split(b);
        System.out.println("再次截取的长度"+split2.length);
        if(split2.length < 2 || split2[1].length() < FLW_TRAILING_SKIP){
            System.out.println("closing marker not found, nothing written");
            return;
        }
        String sab = split[0] + split2[1].substring(FLW_TRAILING_SKIP);

        File file = new File("D:/123.txt");
        // try-with-resources flushes and closes the stream; it used to be left open.
        try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
            ps.println(sab);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}
...@@ -143,7 +143,7 @@ public class PageConnectioner { ...@@ -143,7 +143,7 @@ public class PageConnectioner {
/**构造下载使用的{@link HttpURLConnection} /**构造下载使用的{@link HttpURLConnection}
* @param urlstr 下载url (当参数类型是json字符串时调用) * @param urlstr 下载url (当参数类型是json字符串时调用)
* */ * */
protected HttpURLConnection connection(String urlstr,String params) throws Exception { public HttpURLConnection connection(String urlstr,String params) throws Exception {
URL url = null; URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT)); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpURLConnection connection = null; HttpURLConnection connection = null;
...@@ -658,7 +658,7 @@ public class PageConnectioner { ...@@ -658,7 +658,7 @@ public class PageConnectioner {
* @param postParam post参数,格式为raw(A=a&B=b) * @param postParam post参数,格式为raw(A=a&B=b)
* @return 下载的内容 * @return 下载的内容
*/ */
protected String staticConnectByPost(String url, String encoding, String postParam) { public String staticConnectByPost(String url, String encoding, String postParam) {
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
......
...@@ -29,6 +29,7 @@ public class DocInfo implements Serializable{ ...@@ -29,6 +29,7 @@ public class DocInfo implements Serializable{
private String title; private String title;
private String author; private String author;
private String publishDate; private String publishDate;
private String olPpublishDate;
private String origin; private String origin;
private String keywords; private String keywords;
private String summary; private String summary;
......
...@@ -5,6 +5,7 @@ import com.arronlong.httpclientutil.builder.HCB; ...@@ -5,6 +5,7 @@ import com.arronlong.httpclientutil.builder.HCB;
import com.arronlong.httpclientutil.common.HttpConfig; import com.arronlong.httpclientutil.common.HttpConfig;
import com.arronlong.httpclientutil.common.HttpHeader; import com.arronlong.httpclientutil.common.HttpHeader;
import com.arronlong.httpclientutil.exception.HttpProcessException; import com.arronlong.httpclientutil.exception.HttpProcessException;
import io.netty.handler.codec.Headers;
import org.apache.http.Header; import org.apache.http.Header;
import org.apache.http.client.HttpClient; import org.apache.http.client.HttpClient;
...@@ -26,26 +27,27 @@ public class HttpClientTest { ...@@ -26,26 +27,27 @@ public class HttpClientTest {
public static void testOne() throws HttpProcessException{ public static void testOne() throws HttpProcessException{
System.out.println("--------简单方式调用(默认post)--------"); // System.out.println("--------简单方式调用(默认post)--------");
String url = "https://www.cas.cn/zjs/"; String url = "http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
HttpConfig config = HttpConfig.custom(); // HttpConfig config = HttpConfig.custom();
//简单调用 // //简单调用
String resp = HttpClientUtil.get(config.url(url)); // String resp = HttpClientUtil.get(config.url(url));
//
System.out.println("请求结果内容长度:"+ resp.length()); // System.out.println("请求结果内容长度:"+ resp.length());
System.out.println("请求结果内容长度:"+ resp); // System.out.println("请求结果内容长度:"+ resp);
//
System.out.println("\n#################################\n"); // System.out.println("\n#################################\n");
//
System.out.println("--------加入header设置--------"); // System.out.println("--------加入header设置--------");
url="http://blog.csdn.net/xiaoxian8023"; // url="http://blog.csdn.net/xiaoxian8023";
//设置header信息 // //设置header信息
Header[] headers=HttpHeader.custom().userAgent("Mozilla/5.0").build(); Header[] headers=HttpHeader.custom().userAgent("Mozilla/5.0").build();
//执行请求 // //执行请求
resp = HttpClientUtil.get(config.headers(headers)); // resp = HttpClientUtil.get(config.headers(headers));
System.out.println("请求结果内容长度:"+ resp.length()); //
// System.out.println("请求结果内容长度:"+ resp.length());
System.out.println("\n#################################\n"); //
// System.out.println("\n#################################\n");
// System.out.println("--------代理设置(绕过证书验证)-------"); // System.out.println("--------代理设置(绕过证书验证)-------");
// url="https://www.facebook.com/"; // url="https://www.facebook.com/";
...@@ -60,25 +62,25 @@ public class HttpClientTest { ...@@ -60,25 +62,25 @@ public class HttpClientTest {
// url = "https://sso.tgb.com:8443/cas/login"; // url = "https://sso.tgb.com:8443/cas/login";
// client= HCB.custom().timeout(10000).ssl("D:\\keys\\wsriakey","tomcat").build(); // client= HCB.custom().timeout(10000).ssl("D:\\keys\\wsriakey","tomcat").build();
// headers=HttpHeader.custom().keepAlive("false").connection("close").contentType(Headers.APP_FORM_URLENCODED).build(); // headers=HttpHeader.custom().keepAlive("false").connection("close").contentType(Headers.APP_FORM_URLENCODED).build();
// //执行请求 //执行请求
// resp = CopyOfHttpClientUtil.get(config.method(HttpMethods.GET)); // resp = CopyOfHttpClientUtil.get(config.method(HttpMethods.GET));
// System.out.println("请求结果内容长度:"+ resp.length()); // System.out.println("请求结果内容长度:"+ resp.length());
// try { try {
// System.out.println("--------下载测试-------"); System.out.println("--------下载测试-------");
// url="http://ss.bdimg.com/static/superman/img/logo/logo_white_fe6da1ec.png"; // url="http://ss.bdimg.com/static/superman/img/logo/logo_white_fe6da1ec.png";
// FileOutputStream out = new FileOutputStream(new File("d://aaa//000.png")); FileOutputStream out = new FileOutputStream(new File("d://aaa//000.png"));
// HttpClientUtil.down(HttpConfig.custom().url(url).out(out)); HttpClientUtil.down(HttpConfig.custom().url(url).out(out));
out.flush();
out.close();
System.out.println("--------下载测试+代理-------");
// out = new FileOutputStream(new File("d://aaa//001.png"));
// HttpClientUtil.down(HttpConfig.custom().client(client).url(url).out(out));
// out.flush(); // out.flush();
// out.close(); // out.close();
// System.out.println("--------下载测试+代理-------"); } catch (IOException e) {
// e.printStackTrace();
//// out = new FileOutputStream(new File("d://aaa//001.png")); }
//// HttpClientUtil.down(HttpConfig.custom().client(client).url(url).out(out));
//// out.flush();
//// out.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
System.out.println("\n#################################\n"); System.out.println("\n#################################\n");
} }
......
package com.zzsn.test;
import com.zzsn.generation.Constants;
import java.io.*;
/**
 * Runs PhantomJS as an external process and collects what it prints to
 * standard output.
 *
 * <p>The executable and script paths are built from {@code Constants.CONTENT_DIR}
 * and point at {@code phantomjs.exe}, so they have to be adjusted when the
 * runtime environment is not Windows.
 */
public class JSUtil {
    private static final String projectPath = Constants.CONTENT_DIR;
    private static final String jsPath = projectPath + File.separator + "examples"+ File.separator+"hello.js";
    private static final String exePath = projectPath + File.separator + "bin" + File.separator
            + "phantomjs.exe";

    /** Manual check: renders one page and prints the result. */
    public static void main(String[] args) throws IOException {
        String html = getParseredHtml("https://www.baidu.com/");
        System.out.println("html: " + html);
    }

    /**
     * Runs the configured PhantomJS script against {@code url} and returns its
     * standard output, decoded as UTF-8, with the line terminators removed.
     *
     * <p>Failures are printed and not rethrown; whatever was read before the
     * failure is still returned.
     *
     * @param url page address passed to the script as its only argument
     * @return the concatenated output lines, possibly empty
     * @throws IOException declared for compatibility; failures are handled internally
     */
    public static String getParseredHtml(String url) throws IOException
    {
        StringBuilder collected = new StringBuilder();
        try {
            System.out.println(exePath + " " + jsPath + " " + url);
            readStdout(exePath, jsPath, url, java.nio.charset.StandardCharsets.UTF_8, collected);
            System.out.println(collected.toString());
        }catch (Exception e){
            e.printStackTrace();
        }
        return collected.toString();
    }

    /**
     * Runs a PhantomJS installation at a fixed developer-machine location against
     * {@code url} and prints its standard output, decoded with the platform
     * default charset.
     *
     * @param url page address passed to the script as its only argument
     */
    public static void getHtml(String url)
    {
        System.out.println(jsPath);
        System.out.println(exePath);
        try {
            StringBuilder collected = new StringBuilder();
            readStdout("C:\\Users\\WIN10\\Desktop\\windows\\bin\\phantomjs.exe",
                    "C:\\Users\\WIN10\\Desktop\\windows\\codes.js",
                    url, java.nio.charset.Charset.defaultCharset(), collected);
            System.out.println(collected.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Starts {@code exe script url} and appends every line of the process's
     * standard output to {@code sink} (line terminators are dropped).
     */
    private static void readStdout(String exe, String script, String url,
                                   java.nio.charset.Charset charset, StringBuilder sink) throws IOException {
        // One list element per argument: paths or URLs containing spaces are no longer
        // split into several arguments as they were with Runtime.exec(String).
        // String.valueOf keeps the old behaviour of passing the text "null" for a null url.
        ProcessBuilder builder = new ProcessBuilder(exe, script, String.valueOf(url));
        // Nothing reads the child's stderr here; inheriting it keeps the child from
        // blocking once an unread pipe buffer fills up.
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);
        Process process = builder.start();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream(), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sink.append(line);
            }
        }
    }
}
package com.zzsn.test;
import com.zzsn.generation.Constants;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.UUID;
/**
 * Takes page screenshots by launching PhantomJS with a rendering script.
 *
 * <p>The base directory, target system and script path are parsed once, in the
 * static initializer, from a {@code |}-separated configuration string.
 */
public class PhtomjsTest {
    public static Constants constants = new Constants();
    /** Base directory under which the PhantomJS binaries are looked up. */
    public static String dirPath="";
    /** Target system name: {@code linux32}, {@code linux64}, {@code windows} or {@code mac}. */
    public static String syste="";
    /** Path of the script handed to PhantomJS. */
    public static String jssPath ="";

    static{
        String pathMsg="output|windows|output/phantomjs/omit.js|";
        String[] msg=pathMsg.split("\\|");
        dirPath=msg[0];
        syste=msg[1];
        jssPath= msg[2];
    }

    /**
     * Resolves the PhantomJS executable for the configured system.
     *
     * @return path of the executable below {@link #dirPath}
     * @throws Exception if {@link #syste} is not one of the supported system names
     */
    private String phantomjsPath() throws Exception {
        String system = syste;
        if (system == null) {
            throw new Exception(dirPath+File.separator+"screenshot.yml中的system配置错误");
        }
        switch (system) {
            case "linux32":
                return dirPath+ File.separator+"phantomjs"+File.separator+"linux32"+File.separator+"bin"+File.separator+"phantomjs";
            case "linux64":
                return dirPath+File.separator+"bin"+File.separator+"phantomjs";
            case "windows":
                return dirPath+File.separator+"phantomjs"+File.separator+"windows"+File.separator+"bin"+File.separator+"phantomjs.exe";
            case "mac":
                return dirPath+File.separator+"phantomjs"+File.separator+"mac"+File.separator+"bin"+File.separator+"phantomjs";
            default:
                throw new Exception(dirPath+File.separator+"screenshot.yml中的system配置错误");
        }
    }

    /**
     * Launches PhantomJS to capture {@code httpUrl} into a JPEG file.
     *
     * <p>The method waits until the process prints its first output line (or
     * closes its output) and then terminates the process.
     * NOTE(review): this assumes the script prints only after the image has been
     * written - confirm against the script.
     *
     * @param screenshotPath directory the image is written to
     * @param imageName      file name without extension; a random UUID is used when blank
     * @param httpUrl        address of the page to capture
     * @param width          capture width passed to the script
     * @param height         capture height passed to the script
     * @return {@code screenshotPath + "/" + imageName + ".jpg"}
     * @throws Exception if the system configuration is invalid or the process cannot be started or read
     */
    public String screenshot(String screenshotPath,String imageName,String httpUrl,int width,int height) throws Exception {
        if(imageName == null || "".equals(imageName.trim())){
            imageName = UUID.randomUUID().toString();
        }
        imageName += ".jpg";
        String imageUrl=screenshotPath+"/"+imageName;

        // One list element per argument, so values containing spaces are not split
        // into several arguments as they were with Runtime.exec(String).
        // String.valueOf keeps the old behaviour of passing the text "null" for a null URL.
        Process process = new ProcessBuilder(
                phantomjsPath(),
                jssPath,
                String.valueOf(httpUrl),
                imageUrl,
                String.valueOf(width),
                String.valueOf(height)).start();
        // The reader and the process are now released even when nothing is printed;
        // before, both were only cleaned up after a first line had been read.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            reader.readLine();
        } finally {
            process.destroy();
        }
        return imageUrl;
    }
}
package com.zzsn.test; package com.test;
import java.io.*;
import org.apache.commons.httpclient.Header; import java.net.HttpURLConnection;
import org.apache.commons.httpclient.HttpClient; import java.net.URL;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
import java.util.*;
public class test { public class test {
public static void main(String[] args) throws IOException {
public static void main(String[] args) throws Exception{ String urlStr="http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
//时间戳 String fileName="testImg.png";
long timestamp = new Date().getTime(); downLoadByUrl(urlStr,fileName);
//请求地址 }
String url = "https://auth.xhszjs.com/uias/initLogin.do?appCode=silkroad_znsj?_=" + timestamp; /**
HttpClient client = new HttpClient(); * 从网络Url中下载文件
//post请求方式 * @param urlStr
PostMethod postMethod = new PostMethod(url); * @throws IOException
//推荐的数据存储方式,类似key-value形式 */
NameValuePair telPair = new NameValuePair(); public static String downLoadByUrl(String urlStr,String fileName) throws IOException {
telPair.setName("loginname"); URL url = new URL(urlStr);
telPair.setValue("z_dianxin_zhangqiankun"); HttpURLConnection conn = (HttpURLConnection)url.openConnection();
NameValuePair pwdPair = new NameValuePair("j_password","zqk@9988"); //设置超时间为3秒
//封装请求参数 conn.setConnectTimeout(5*1000);
postMethod.setRequestBody(new NameValuePair[]{telPair,pwdPair}); //防止屏蔽程序抓取而返回403错误
//这里是设置请求内容为json格式,根据站点的格式决定 // conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
//因为这个网站会将账号密码转为json格式,所以需要这一步 conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
postMethod.setRequestHeader("Content_Type","application/x-www-form-urlencoded"); // conn.setRequestProperty("Accept-Language", " zh-CN,zh;q=0.9");
//执行请求 // conn.setRequestProperty("Cache-Control", " no-cache");
client.executeMethod(postMethod); // conn.setRequestProperty("Connection", " keep-alive");
//通过Post/GetMethod对象获取响应头信息 // conn.setRequestProperty("Host", " ccecc.crcc.cn");
// String cookie = postMethod.getResponseHeader("Set-Cookie").getValue(); // conn.setRequestProperty("Pragma", " no-cache");
//截取需要的内容 // conn.setRequestProperty("Upgrade-Insecure-Requests", " 1");
// String sub = cookie.substring(cookie.indexOf("&"), cookie.lastIndexOf("&")); conn.setRequestProperty("User-Agent", " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36");
// String[] splitPwd = sub.split("="); //得到输入流
// String pwd = splitPwd[1]; InputStream inputStream = conn.getInputStream();
// System.out.println(cookie); //获取自己数组
byte[] getData = readInputStream(inputStream);
//获取cookie //获取项目根目录地址
Header[] headers = postMethod.getResponseHeaders(); String propertiesFile ="D://";
for (int i = 0; i < headers.length; i++) { //文件保存位置
System.out.println(headers[i].getName()+":"+headers[i].getValue()); File saveDir = new File(propertiesFile);
if(!saveDir.exists()){
saveDir.mkdir();
} }
File file = new File(saveDir+ File.separator+fileName);
FileOutputStream fos = new FileOutputStream(file);
fos.write(getData);
if(fos!=null){
fos.close();
}
if(inputStream!=null){
inputStream.close();
}
return propertiesFile+"/"+fileName;
} }
/**
 * Reads the given stream to its end and returns everything that was read.
 *
 * @param inputStream stream to drain; it is not closed by this method
 * @return all bytes read from the stream, in order
 * @throws IOException if reading from the stream fails
 */
public static byte[] readInputStream(InputStream inputStream) throws IOException {
    final byte[] chunk = new byte[1024];
    try (ByteArrayOutputStream collected = new ByteArrayOutputStream()) {
        // Copy chunk by chunk until read() signals end of stream with -1.
        for (int count = inputStream.read(chunk); count != -1; count = inputStream.read(chunk)) {
            collected.write(chunk, 0, count);
        }
        return collected.toByteArray();
    }
}
} }
...@@ -159,7 +159,8 @@ public class PublishDateUtil { ...@@ -159,7 +159,8 @@ public class PublishDateUtil {
}else }else
{ {
return formatUSDate(raw); // return formatUSDate(raw);
return raw;
} }
} }
......
DEV_MODEL=0 DEV_MODEL=0
CONTENT_DIR=D\://toy/dest/content CONTENT_DIR=E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\phantomjs\\windows
IMG_DIR=D\://toy/dest/img IMG_DIR=D\://toy/dest/img
SAVE_LIMIT_SIZE=1 SAVE_LIMIT_SIZE=1
AUTO_KEYWORDS_SIZE=10 AUTO_KEYWORDS_SIZE=10
...@@ -69,15 +69,18 @@ KAFKA_PRODUCT_PARTITION=0 ...@@ -69,15 +69,18 @@ KAFKA_PRODUCT_PARTITION=0
# Redis settings # Redis settings
redis.host=114.116.26.150 #redis.host=114.116.26.150
redis.port=6379 #redis.port=6379
redis.pass=zzsn9988 #redis.pass=zzsn9988
#redis.host=114.115.236.206 #redis.host=114.115.236.206
#redis.port=6379 #redis.port=6379
#redis.pass=clbzzsn #redis.pass=clbzzsn
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
redis.host=127.0.0.1
redis.port=6379
redis.pass=xxxxxx
redis.timeout=10000 redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
...@@ -106,5 +109,3 @@ selenium.driver.cache=selenium_driver_cache_loc112 ...@@ -106,5 +109,3 @@ selenium.driver.cache=selenium_driver_cache_loc112
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# [PhantomJS](http://phantomjs.org) - Scriptable Headless WebKit
PhantomJS ([phantomjs.org](http://phantomjs.org)) is a headless WebKit scriptable with JavaScript. The latest [stable release](http://phantomjs.org/release-2.0.html) is version 2.0.
**Note**: Please **do not** create a GitHub pull request **without** reading the [Contribution Guide](https://github.com/ariya/phantomjs/blob/master/CONTRIBUTING.md) first. Failure to do so may result in the rejection of the pull request.
## Use Cases
- **Headless web testing**. Lightning-fast testing without the browser is now possible!
- **Page automation**. [Access and manipulate](http://phantomjs.org/page-automation.html) web pages with the standard DOM API, or with usual libraries like jQuery.
- **Screen capture**. Programmatically [capture web contents](http://phantomjs.org/screen-capture.html), including CSS, SVG and Canvas. Build server-side web graphics apps, from a screenshot service to a vector chart rasterizer.
- **Network monitoring**. Automate performance analysis, track [page loading](http://phantomjs.org/network-monitoring.html) and export as standard HAR format.
## Features
- **Multiplatform**, available on major operating systems: Windows, Mac OS X, Linux, and other Unices.
- **Fast and native implementation** of web standards: DOM, CSS, JavaScript, Canvas, and SVG. No emulation!
- **Pure headless (no X11) on Linux**, ideal for continuous integration systems. Also runs on Amazon EC2, Heroku, and Iron.io.
- **Easy to install**: [Download](http://phantomjs.org/download.html), unpack, and start having fun in just 5 minutes.
## Questions?
- Explore the complete [documentation](http://phantomjs.org/documentation/).
- Read tons of [user articles](http://phantomjs.org/buzz.html) on using PhantomJS.
- Join the [mailing-list](http://groups.google.com/group/phantomjs) and discuss with other PhantomJS fans.
PhantomJS is free software/open source, and is distributed under the [BSD license](http://opensource.org/licenses/BSD-3-Clause). It contains third-party code, see the included `third-party.txt` file for the license information on third-party code.
PhantomJS is created and maintained by [Ariya Hidayat](http://ariya.ofilabs.com/about) (Twitter: [@ariyahidayat](http://twitter.com/ariyahidayat)), with the help of [many contributors](https://github.com/ariya/phantomjs/contributors). Follow the official Twitter stream [@PhantomJS](http://twitter.com/PhantomJS) to get frequent development updates.
"use strict";
// Print every command-line argument with its index, or a hint when only
// a single entry is present in the argument list.
var system = require('system');
var argv = system.args;

if (argv.length !== 1) {
    argv.forEach(function (value, index) {
        console.log(index + ': ' + value);
    });
} else {
    console.log('Try to pass some args when invoking this script!');
}
phantom.exit();
"use strict";
// Demonstrates the child_process module: one command is started with spawn()
// and its stdout/stderr/exit events are logged; a second command is run with
// execFile() and its collected output is logged from the callback.
var childProcess = require("child_process");
var spawn = childProcess.spawn;
var execFile = childProcess.execFile;

// Builds a listener that logs its payload as JSON behind the given label.
var report = function (label) {
    return function (payload) {
        console.log(label, JSON.stringify(payload));
    };
};

var lister = spawn("ls", ["-lF", "/rooot"]);

lister.stdout.on("data", report("spawnSTDOUT:"));
lister.stderr.on("data", report("spawnSTDERR:"));
lister.on("exit", function (exitCode) {
    console.log("spawnEXIT:", exitCode);
});

//lister.kill("SIGKILL")

execFile("ls", ["-lF", "/usr"], null, function (err, stdout, stderr) {
    console.log("execFileSTDOUT:", JSON.stringify(stdout));
    console.log("execFileSTDERR:", JSON.stringify(stderr));
});

// Leave two seconds for the child processes to report before exiting.
setTimeout(function () {
    phantom.exit(0);
}, 2000);
"use strict";
// Draw a colour wheel on a canvas inside a blank page and render it to colorwheel.png.
var wheelPage = require('webpage').create();

wheelPage.viewportSize = { width: 400, height: 400 };
wheelPage.content = '<html><body><canvas id="surface"></canvas></body></html>';

wheelPage.evaluate(function () {
    var canvas = document.getElementById('surface');
    var ctx = canvas.getContext('2d');
    var width = window.innerWidth;
    var height = window.innerHeight;
    var centerX = width / 2;
    var centerY = height / 2;
    var radius = width / 2.3;
    var radiusSquared = radius * radius;
    var image, data;
    var offset = 0;
    var px, py, dx, dy, distSquared;
    var hue, sat, sector, frac, low, falling, rising;

    canvas.width = width;
    canvas.height = height;
    image = ctx.createImageData(width, height);
    data = image.data;

    for (py = 0; py < height; py += 1) {
        // offset advances by four bytes (R, G, B, A) for every pixel visited.
        for (px = 0; px < width; px += 1, offset += 4) {
            dx = px - centerX;
            dy = py - centerY;
            distSquared = dx * dx + dy * dy;
            if (distSquared >= radiusSquared) {
                continue; // outside the wheel: leave the pixel untouched
            }

            // Angle mapped onto [0, 6]; distance from the centre is the saturation.
            hue = 6 * (Math.atan2(dy, dx) + Math.PI) / (2 * Math.PI);
            sat = Math.sqrt(distSquared) / radius;
            sector = Math.floor(hue);
            frac = hue - sector;
            low = 255 * (1 - sat);
            falling = 255 * (1 - sat * frac);
            rising = 255 * (1 - sat * (1 - frac));

            // Seven entries per table so that sector === 6 is also covered.
            data[offset] = [255, falling, low, low, rising, 255, 255][sector];
            data[offset + 1] = [rising, 255, 255, falling, low, low, rising][sector];
            data[offset + 2] = [low, low, rising, 255, 255, falling, low][sector];
            data[offset + 3] = 255;
        }
    }

    ctx.putImageData(image, 0, 0);
    document.body.style.backgroundColor = 'white';
    document.body.style.margin = '0px';
});

wheelPage.render('colorwheel.png');
phantom.exit();
"use strict";
// Count down from 10 once per second, then announce lift-off and exit.
var remaining = 10;
var timer = setInterval(function () {
    if (remaining <= 0) {
        console.log("BLAST OFF!");
        phantom.exit();
        return;
    }
    console.log(remaining--);
}, 1000);
// Detect if a web page sniffs the user agent or not.
"use strict";
var system = require('system'),
    page = require('webpage').create();

// Runs inside the page: replaces window.navigator with an object whose
// userAgent/platform getters record that they were read.
var installSniffTrap = function () {
    var realUserAgent = window.navigator.userAgent,
        realPlatform = window.navigator.platform;

    window.navigator = {
        appCodeName: 'Mozilla',
        appName: 'Netscape',
        cookieEnabled: false,
        sniffed: false
    };

    window.navigator.__defineGetter__('userAgent', function () {
        window.navigator.sniffed = true;
        return realUserAgent;
    });

    window.navigator.__defineGetter__('platform', function () {
        window.navigator.sniffed = true;
        return realPlatform;
    });
};

page.onInitialized = function () {
    page.evaluate(installSniffTrap);
};

if (system.args.length !== 1) {
    var target = system.args[1];
    console.log('Checking ' + target + '...');
    page.open(target, function (status) {
        if (status === 'success') {
            // Give the page 1.5 s to run its scripts before reading the flag.
            window.setTimeout(function () {
                var wasSniffed = page.evaluate(function () {
                    return navigator.sniffed;
                });
                console.log(wasSniffed
                    ? 'The page tried to sniff the user agent.'
                    : 'The page did not try to sniff the user agent.');
                phantom.exit();
            }, 1500);
        } else {
            console.log('FAIL to load the address');
            phantom.exit();
        }
    });
} else {
    console.log('Usage: detectsniff.js <some URL>');
    phantom.exit(1);
}
// echoToFile.js - Write in a given file all the parameters passed on the CLI
"use strict";
var fs = require('fs'),
    system = require('system');

if (system.args.length >= 3) {
    // Everything after the destination file is echoed, separated by single spaces.
    var words = Array.prototype.slice.call(system.args, 2);

    try {
        fs.write(system.args[1], words.join(' '), 'w');
    } catch (e) {
        console.log(e);
    }

    phantom.exit();
} else {
    console.log("Usage: echoToFile.js DESTINATION_FILE <arguments to echo...>");
    phantom.exit(1);
}
"use strict";
// Load Modernizr and list which of its detected features are supported
// and which are not.
var supported = [],
    unsupported = [];

// Prints an empty line, a heading, then one line per feature name.
var printGroup = function (heading, names) {
    console.log('');
    console.log(heading);
    names.forEach(function (name) {
        console.log(' ' + name);
    });
};

phantom.injectJs('modernizr.js');
console.log('Detected features (using Modernizr ' + Modernizr._version + '):');

Object.keys(Modernizr).forEach(function (name) {
    // Skip private members, functions and the two composite entries.
    if (name[0] === '_' || typeof Modernizr[name] === 'function') {
        return;
    }
    if (name === 'input' || name === 'inputtypes') {
        return;
    }
    (Modernizr[name] ? supported : unsupported).push(name);
});

printGroup('Supported:', supported);
printGroup('Not supported:', unsupported);
phantom.exit();
"use strict";
// Print a Fibonacci number every 300 ms; stop once the sequence holds more than 10 values.
var sequence = [0, 1];
var timerId = window.setInterval(function () {
    var count = sequence.length;
    var latest = sequence[count - 1];

    console.log(latest);
    sequence.push(latest + sequence[count - 2]);

    if (sequence.length <= 10) {
        return;
    }
    window.clearInterval(timerId);
    phantom.exit();
}, 300);
"use strict";
// Minimal script: print a greeting to the console.
console.log('Hello, world!');
// Ask PhantomJS to exit once the message has been printed.
phantom.exit();
// Use 'page.injectJs()' to load the script itself in the Page context
"use strict";
if (typeof(phantom) === "undefined") {
    // No 'phantom' object available: this copy was injected into a page.
    alert("* Script running in the Page context.");
} else {
    var page = require('webpage').create();

    // Route "console.log()" and "alert()" calls from within the Page context
    // to the main Phantom context (i.e. current "this")
    var relay = function (msg) {
        console.log(msg);
    };
    page.onConsoleMessage = relay;
    page.onAlert = relay;

    console.log("* Script running in the Phantom context.");
    console.log("* Script will 'inject' itself in a page...");

    page.open("about:blank", function (status) {
        if (status === "success") {
            var injected = page.injectJs("injectme.js");
            console.log(injected ? "... done injecting itself!" : "... fail! Check the $PWD?!");
        }
        phantom.exit();
    });
}
"use strict";
// Measure how long a URL takes to load and print the page title with the elapsed time.
var page = require('webpage').create(),
    system = require('system');

if (system.args.length !== 1) {
    var startedAt = Date.now();
    var address = system.args[1];

    page.open(address, function (status) {
        if (status === 'success') {
            // Take the timing before doing any further work on the page.
            var elapsed = Date.now() - startedAt;
            var title = page.evaluate(function () {
                return document.title;
            });
            console.log('Page title is ' + title);
            console.log('Loading time ' + elapsed + ' msec');
        } else {
            console.log('FAIL to load the address');
        }
        phantom.exit();
    });
} else {
    console.log('Usage: loadspeed.js <some URL>');
    phantom.exit(1);
}
"use strict";
// Load a URL while aborting every request that looks like a stylesheet.
var page = require('webpage').create(),
    system = require('system');

if (system.args.length < 2) {
    console.log('Usage: loadurlwithoutcss.js URL');
    phantom.exit();
} else {
    // The rest lives in the else branch because phantom.exit() does not stop
    // the current script from running on; without it the page would be
    // opened with an undefined address.
    var address = system.args[1];

    page.onResourceRequested = function (requestData, request) {
        // Match stylesheet URLs served over http as well as https.
        var looksLikeCss = (/https?:\/\/.+?\.css/i).test(requestData['url']);

        if (looksLikeCss || requestData.headers['Content-Type'] == 'text/css') {
            console.log('The url of the request is matching. Aborting: ' + requestData['url']);
            request.abort();
        }
    };

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
        }
        phantom.exit();
    });
}
\ No newline at end of file
"use strict";
// Load the module file './universe' that sits next to this script.
var universe = require('./universe');
// Call the module's start() before reading its 'answer' property.
universe.start();
console.log('The answer is ' + universe.answer);
phantom.exit();
"use strict";
// Log every resource request and response of a page as pretty-printed JSON.
var page = require('webpage').create(),
    system = require('system');

// Builds a handler that prints its argument as indented JSON behind a prefix.
var dumpWith = function (prefix) {
    return function (payload) {
        console.log(prefix + JSON.stringify(payload, undefined, 4));
    };
};

if (system.args.length !== 1) {
    var address = system.args[1];

    page.onResourceRequested = dumpWith('requested: ');
    page.onResourceReceived = dumpWith('received: ');

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
        }
        phantom.exit();
    });
} else {
    console.log('Usage: netlog.js <some URL>');
    phantom.exit(1);
}
"use strict";
// Fallback for runtimes without Date.prototype.toISOString.
// The result ends in 'Z' (UTC), so every field is read with the UTC getters;
// using the local-time getters would produce a timestamp shifted by the
// local timezone offset.
if (!Date.prototype.toISOString) {
    Date.prototype.toISOString = function () {
        // Left-pad to two digits.
        function pad(n) { return n < 10 ? '0' + n : n; }
        // Left-pad milliseconds to three digits.
        function ms(n) { return n < 10 ? '00' + n : n < 100 ? '0' + n : n; }
        return this.getUTCFullYear() + '-' +
            pad(this.getUTCMonth() + 1) + '-' +
            pad(this.getUTCDate()) + 'T' +
            pad(this.getUTCHours()) + ':' +
            pad(this.getUTCMinutes()) + ':' +
            pad(this.getUTCSeconds()) + '.' +
            ms(this.getUTCMilliseconds()) + 'Z';
    };
}
/**
 * Builds a HAR (HTTP Archive, version 1.2) object for one page load.
 *
 * @param address   page URL; used as the page id and as each entry's pageref
 * @param title     page title
 * @param startTime Date at which the page load started
 * @param resources array of { request, startReply, endReply } records
 * @returns object with a single 'log' property holding creator, pages and entries
 *
 * Note: pageTimings.onLoad is read from the script-level 'page' object
 * (page.endTime - page.startTime), not from the parameters.
 */
function createHAR(address, title, startTime, resources)
{
    // Converts one resource record into a HAR entry, or returns null when the
    // record is incomplete or refers to a data: image URI.
    function toEntry(resource) {
        var req = resource.request,
            first = resource.startReply,
            last = resource.endReply;

        if (!req || !first || !last) {
            return null;
        }

        // Exclude Data URI from HAR file because
        // they aren't included in specification
        if (req.url.match(/(^data:image\/.*)/i)) {
            return null;
        }

        return {
            startedDateTime: req.time.toISOString(),
            time: last.time - req.time,
            request: {
                method: req.method,
                url: req.url,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: req.headers,
                queryString: [],
                headersSize: -1,
                bodySize: -1
            },
            response: {
                status: last.status,
                statusText: last.statusText,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: last.headers,
                redirectURL: "",
                headersSize: -1,
                bodySize: first.bodySize,
                content: {
                    size: first.bodySize,
                    mimeType: last.contentType
                }
            },
            cache: {},
            timings: {
                blocked: 0,
                dns: -1,
                connect: -1,
                send: 0,
                wait: first.time - req.time,
                receive: last.time - first.time,
                ssl: -1
            },
            pageref: address
        };
    }

    var entries = [];
    resources.forEach(function (resource) {
        var entry = toEntry(resource);
        if (entry !== null) {
            entries.push(entry);
        }
    });

    var creatorVersion = phantom.version.major + '.' + phantom.version.minor +
        '.' + phantom.version.patch;

    return {
        log: {
            version: '1.2',
            creator: {
                name: "PhantomJS",
                version: creatorVersion
            },
            pages: [{
                startedDateTime: startTime.toISOString(),
                id: address,
                title: title,
                pageTimings: {
                    onLoad: page.endTime - page.startTime
                }
            }],
            entries: entries
        }
    };
}
// Load a URL, record every request/response pair, and print the result as HAR JSON.
var page = require('webpage').create(),
    system = require('system');

if (system.args.length !== 1) {
    page.address = system.args[1];
    page.resources = [];

    page.onLoadStarted = function () {
        page.startTime = new Date();
    };

    // One record per request id; the replies are filled in as they arrive.
    page.onResourceRequested = function (req) {
        page.resources[req.id] = {
            request: req,
            startReply: null,
            endReply: null
        };
    };

    page.onResourceReceived = function (res) {
        if (res.stage === 'start') {
            page.resources[res.id].startReply = res;
        } else if (res.stage === 'end') {
            page.resources[res.id].endReply = res;
        }
    };

    page.open(page.address, function (status) {
        if (status === 'success') {
            page.endTime = new Date();
            page.title = page.evaluate(function () {
                return document.title;
            });
            var har = createHAR(page.address, page.title, page.startTime, page.resources);
            console.log(JSON.stringify(har, undefined, 4));
            phantom.exit();
        } else {
            console.log('FAIL to load the address');
            phantom.exit(1);
        }
    });
} else {
    console.log('Usage: netsniff.js <some URL>');
    phantom.exit(1);
}
"use strict";
// Open a URL through a manually configured proxy and print the page title.
var page = require('webpage').create(),
    system = require('system');

if (system.args.length >= 4) {
    var proxyHost = system.args[1],
        proxyPort = system.args[2],
        targetUrl = system.args[3];

    phantom.setProxy(proxyHost, proxyPort, 'manual', '', '');

    page.open(targetUrl, function (status) {
        if (status === 'success') {
            var title = page.evaluate(function () {
                return document.title;
            });
            console.log('Page title is ' + title);
        } else {
            console.log('FAIL to load the address "' +
                targetUrl + '" using proxy "' + proxyHost + ':' + proxyPort + '"');
        }
        phantom.exit();
    });
} else {
    console.log('Usage: openurlwithproxy.js <proxyHost> <proxyPort> <URL>');
    phantom.exit(1);
}
"use strict";
// Print the same greeting under the default output encoding and then under
// several explicitly selected encodings.
function greet() {
    console.log(phantom.outputEncoding + ": こんにちは、世界!");
}

console.log("Using default encoding...");
greet();

console.log("\nUsing other encodings...");
["euc-jp", "sjis", "utf8", "System"].forEach(function (encoding) {
    phantom.outputEncoding = encoding;
    greet();
});

phantom.exit()
// Shows how and when page events fire across five steps:
//
// 1. Load URL
// 2. Load the same URL with an internal FRAGMENT added
// 3. Click on an internal link that points to another internal FRAGMENT
// 4. Click on an external link that sends the page somewhere else
// 5. Close page
//
// Read the output carefully to see when things happen (and in which order),
// and in particular what DOESN'T happen during step 3.
//
// If invoked with "-v" the Page Resources are also printed as they are
// Requested and Received.
//
// NOTE.1: The "onConsoleMessage/onAlert/onPrompt/onConfirm" events are
//         registered but not used here. This is left for you to have fun with.
// NOTE.2: This script is not here to teach you ANY JavaScript. It's awful!
// NOTE.3: Main audience for this are people new to PhantomJS.
"use strict";
var sys = require("system"),
    page = require("webpage").create(),
    logResources = sys.args.length > 1 && sys.args[1] === "-v",
    step1url = "http://en.wikipedia.org/wiki/DOM_events",
    step2url = "http://en.wikipedia.org/wiki/DOM_events#Event_flow";

// Prints each argument it receives as JSON, followed by an empty line.
function printArgs() {
    var index = 0,
        total = arguments.length;
    while (index < total) {
        console.log(" arguments[" + index + "] = " + JSON.stringify(arguments[index]));
        index += 1;
    }
    console.log("");
}

// Builds an event handler that prints its own label and then its arguments.
function announce(label) {
    return function () {
        console.log(label);
        printArgs.apply(this, arguments);
    };
}

// Runs 'action' after 'delay' ms, preceded by an empty line and a step banner.
function schedule(delay, banner, action) {
    setTimeout(function () {
        console.log("");
        console.log(banner);
        action();
    }, delay);
}

// Dispatches a synthetic click on the first element matching 'selector' inside the page.
function clickInPage(selector) {
    page.evaluate(function (sel) {
        var ev = document.createEvent("MouseEvents");
        ev.initEvent("click", true, true);
        document.querySelector(sel).dispatchEvent(ev);
    }, selector);
}

////////////////////////////////////////////////////////////////////////////////

page.onInitialized = announce("page.onInitialized");
page.onLoadStarted = announce("page.onLoadStarted");
page.onLoadFinished = announce("page.onLoadFinished");
page.onUrlChanged = announce("page.onUrlChanged");
page.onNavigationRequested = announce("page.onNavigationRequested");
page.onRepaintRequested = announce("page.onRepaintRequested");

if (logResources === true) {
    page.onResourceRequested = announce("page.onResourceRequested");
    page.onResourceReceived = announce("page.onResourceReceived");
}

page.onClosing = announce("page.onClosing");

// window.console.log(msg);
page.onConsoleMessage = announce("page.onConsoleMessage");

// window.alert(msg);
page.onAlert = announce("page.onAlert");

// var confirmed = window.confirm(msg);
page.onConfirm = announce("page.onConfirm");

// var user_value = window.prompt(msg, default_value);
page.onPrompt = announce("page.onPrompt");

////////////////////////////////////////////////////////////////////////////////

schedule(0, "### STEP 1: Load '" + step1url + "'", function () {
    page.open(step1url);
});

schedule(5000, "### STEP 2: Load '" + step2url + "' (load same URL plus FRAGMENT)", function () {
    page.open(step2url);
});

schedule(10000, "### STEP 3: Click on page internal link (aka FRAGMENT)", function () {
    clickInPage("a[href='#Event_object']");
});

schedule(15000, "### STEP 4: Click on page external link", function () {
    clickInPage("a[title='JavaScript']");
});

schedule(20000, "### STEP 5: Close page and shutdown (with a delay)", function () {
    page.close();
    setTimeout(function () {
        phantom.exit();
    }, 100);
});
"use strict";
// Round trip between the page context and the phantom context via callPhantom/onCallback.
var webPage = require("webpage").create();

webPage.onConsoleMessage = function (text) {
    console.log(text);
};

// Calls to "callPhantom" within the page arrive here
webPage.onCallback = function (text) {
    console.log("Received by the 'phantom' main context: "+text);
    return "Hello there, I'm coming to you from the 'phantom' context instead";
};

webPage.evaluate(function () {
    // Return-value of the "onCallback" handler arrive here
    var reply = window.callPhantom("Hello, I'm coming to you from the 'page' context");
    console.log("Received by the 'page' context: "+reply);
});

phantom.exit();
// Read the Phantom webpage '#intro' element text using jQuery and "includeJs"
"use strict";
var page = require('webpage').create();

// Forward the page's console output to this script's console.
page.onConsoleMessage = function (text) {
    console.log(text);
};

page.open("http://phantomjs.org/", function (status) {
    if (status !== "success") {
        phantom.exit(1);
        return;
    }

    page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
        page.evaluate(function () {
            console.log("$(\".explanation\").text() -> " + $(".explanation").text());
        });
        phantom.exit(0);
    });
});
// Example using HTTP POST operation
"use strict";
var page = require('webpage').create(),
    server = 'http://posttestserver.com/post.php?dump',
    data = 'universe=expanding&answer=42';

// Prints the response body on success, or an error message otherwise, then exits.
var onPosted = function (status) {
    if (status === 'success') {
        console.log(page.content);
    } else {
        console.log('Unable to post!');
    }
    phantom.exit();
};

page.open(server, 'post', data, onPosted);
// Example using HTTP POST operation with a JSON body and an explicit Content-Type header
"use strict";
var page = require('webpage').create(),
    server = 'http://posttestserver.com/post.php?dump',
    data = '{"universe": "expanding", "answer": 42}',
    headers = { "Content-Type": "application/json" };

// Prints the response body on success, or an error message otherwise, then exits.
var onPosted = function (status) {
    if (status === 'success') {
        console.log(page.content);
    } else {
        console.log('Unable to post!');
    }
    phantom.exit();
};

page.open(server, 'post', data, headers, onPosted);
// Example using HTTP POST operation: starts a local web server that echoes
// each request back as JSON, then POSTs form data to it and prints the reply.
"use strict";
var page = require('webpage').create(),
    server = require('webserver').create(),
    system = require('system'),
    data = 'universe=expanding&answer=42';

if (system.args.length !== 2) {
    console.log('Usage: postserver.js <portnumber>');
    phantom.exit(1);
} else {
    // Everything else stays in this branch: phantom.exit() does not stop the
    // script from running on, so a missing port must not reach listen().
    var port = system.args[1];

    // 'service' must be declared: assigning to an undeclared name throws a
    // ReferenceError in strict mode.
    var service = server.listen(port, function (request, response) {
        console.log('Request received at ' + new Date());

        response.statusCode = 200;
        response.headers = {
            'Cache': 'no-cache',
            'Content-Type': 'text/plain;charset=utf-8'
        };
        response.write(JSON.stringify(request, null, 4));
        response.close();
    });

    if (!service) {
        // listen() reports failure (e.g. port already in use) through its return value.
        console.log('Unable to listen on port ' + port);
        phantom.exit(1);
    } else {
        page.open('http://localhost:' + port + '/', 'post', data, function (status) {
            if (status !== 'success') {
                console.log('Unable to post!');
            } else {
                console.log(page.plainText);
            }
            phantom.exit();
        });
    }
}
// Print every environment variable as NAME=value.
var system = require('system'),
    env = system.env,
    name;

for (name in env) {
    // Only the object's own entries are printed.
    if (!env.hasOwnProperty(name)) {
        continue;
    }
    console.log(name + '=' + env[name]);
}
phantom.exit();
"use strict";
// Render a URL to a file on A4 paper with a header and footer on each page.
// The defaults below are replaced by the page's own PhantomJSPrinting object
// when the loaded page defines one.
var page = require('webpage').create(),
    system = require('system');

function someCallback(pageNum, numPages) {
    return "<h1> someCallback: " + pageNum + " / " + numPages + "</h1>";
}

if (system.args.length < 3) {
    console.log('Usage: printheaderfooter.js URL filename');
    phantom.exit(1);
} else {
    var address = system.args[1];
    var output = system.args[2];
    page.viewportSize = { width: 600, height: 600 };
    page.paperSize = {
        format: 'A4',
        margin: "1cm",
        /* default header/footer for pages that don't have custom overwrites (see below) */
        header: {
            height: "1cm",
            contents: phantom.callback(function(pageNum, numPages) {
                // No header on the first page.
                if (pageNum == 1) {
                    return "";
                }
                return "<h1>Header <span style='float:right'>" + pageNum + " / " + numPages + "</span></h1>";
            })
        },
        footer: {
            height: "1cm",
            contents: phantom.callback(function(pageNum, numPages) {
                // No footer on the last page.
                if (pageNum == numPages) {
                    return "";
                }
                return "<h1>Footer <span style='float:right'>" + pageNum + " / " + numPages + "</span></h1>";
            })
        }
    };
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
        } else {
            /* check whether the loaded page overwrites the header/footer setting,
               i.e. whether a PhantomJSPrinting object exists. Use that then instead
               of our defaults above.
               example:
               <html>
               <head>
               <script type="text/javascript">
               var PhantomJSPrinting = {
                   header: {
                       height: "1cm",
                       contents: function(pageNum, numPages) { return pageNum + "/" + numPages; }
                   },
                   footer: {
                       height: "1cm",
                       contents: function(pageNum, numPages) { return pageNum + "/" + numPages; }
                   }
               };
               </script>
               </head>
               <body><h1>asdfadsf</h1><p>asdfadsfycvx</p></body>
               </html>
            */
            if (page.evaluate(function(){return typeof PhantomJSPrinting == "object";})) {
                // Declared with 'var': assigning to an undeclared name throws a
                // ReferenceError in strict mode.
                var paperSize = page.paperSize;
                paperSize.header.height = page.evaluate(function() {
                    return PhantomJSPrinting.header.height;
                });
                paperSize.header.contents = phantom.callback(function(pageNum, numPages) {
                    return page.evaluate(function(pageNum, numPages){return PhantomJSPrinting.header.contents(pageNum, numPages);}, pageNum, numPages);
                });
                paperSize.footer.height = page.evaluate(function() {
                    return PhantomJSPrinting.footer.height;
                });
                paperSize.footer.contents = phantom.callback(function(pageNum, numPages) {
                    return page.evaluate(function(pageNum, numPages){return PhantomJSPrinting.footer.contents(pageNum, numPages);}, pageNum, numPages);
                });
                // Write the modified object back so the new settings take effect.
                page.paperSize = paperSize;
                console.log(page.paperSize.header.height);
                console.log(page.paperSize.footer.height);
            }
            // Short delay before rendering, then exit.
            window.setTimeout(function () {
                page.render(output);
                phantom.exit();
            }, 200);
        }
    });
}
"use strict";
// Render a URL to a file on A4 paper using four caller-supplied margins.
var page = require('webpage').create(),
    system = require('system');

if (system.args.length >= 7) {
    var address = system.args[1];
    var output = system.args[2];

    page.viewportSize = { width: 600, height: 600 };
    // Margin arguments arrive in the order LEFT TOP RIGHT BOTTOM.
    page.paperSize = {
        format: 'A4',
        margin: {
            left: system.args[3],
            top: system.args[4],
            right: system.args[5],
            bottom: system.args[6]
        }
    };

    page.open(address, function (status) {
        if (status === 'success') {
            // Short delay before rendering, then exit.
            window.setTimeout(function () {
                page.render(output);
                phantom.exit();
            }, 200);
        } else {
            console.log('Unable to load the address!');
        }
    });
} else {
    console.log('Usage: printmargins.js URL filename LEFT TOP RIGHT BOTTOM');
    console.log(' margin examples: "1cm", "10px", "7mm", "5in"');
    phantom.exit(1);
}
"use strict";
// Render a URL to an image or PDF file, with optional paper/viewport size and zoom.
var page = require('webpage').create(),
    system = require('system'),
    address, output, size,
    // Declared here: assigning to undeclared names throws a ReferenceError in
    // strict mode, which broke every invocation that passed a "px" size.
    pageWidth, pageHeight;

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
    console.log(' "800px*600px" window, clipped to 800x600');
    phantom.exit(1);
} else {
    address = system.args[1];
    output = system.args[2];
    page.viewportSize = { width: 600, height: 600 };
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        // PDF output: the third argument is either "W*H" or a paper format name.
        size = system.args[3].split('*');
        page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
                                           : { format: system.args[3], orientation: 'portrait', margin: '1cm' };
    } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
        // Image output: "Wpx*Hpx" clips to that window, "Wpx" only sets the width.
        size = system.args[3].split('*');
        if (size.length === 2) {
            pageWidth = parseInt(size[0], 10);
            pageHeight = parseInt(size[1], 10);
            page.viewportSize = { width: pageWidth, height: pageHeight };
            page.clipRect = { top: 0, left: 0, width: pageWidth, height: pageHeight };
        } else {
            console.log("size:", system.args[3]);
            pageWidth = parseInt(system.args[3], 10);
            pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
            console.log ("pageHeight:",pageHeight);
            page.viewportSize = { width: pageWidth, height: pageHeight };
        }
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            phantom.exit(1);
        } else {
            // Short delay before rendering, then exit.
            window.setTimeout(function () {
                page.render(output);
                phantom.exit();
            }, 200);
        }
    });
}
// Render Multiple URLs to file
"use strict";
var system = require("system");

/*
Render the given urls one after another, each into "rendermulti-<n>.png".
@param urls array of URLs to render (without the "http://" prefix); consumed in place
@param callbackPerUrl Function called after finishing each URL, including the last URL
@param callbackFinal Function called after finishing everything
*/
function RenderUrlsToFile(urls, callbackPerUrl, callbackFinal) {
    var webpage = require("webpage"),
        currentPage = null,
        counter = 0;

    // Closes the current page, reports the outcome, then moves to the next URL.
    function finishOne(status, url, file) {
        currentPage.close();
        callbackPerUrl(status, url, file);
        return processNext();
    }

    // Takes the next URL off the list and renders it, or finishes when none are left.
    function processNext() {
        var url;

        if (urls.length === 0) {
            return callbackFinal();
        }

        url = urls.shift();
        counter++;
        currentPage = webpage.create();
        currentPage.viewportSize = { width: 800, height: 600 };
        currentPage.settings.userAgent = "Phantom.js bot";

        return currentPage.open("http://" + url, function (status) {
            var file = "rendermulti-" + counter + ".png";

            if (status !== "success") {
                return finishOne(status, url, file);
            }
            // Short delay before rendering the loaded page.
            return window.setTimeout(function () {
                currentPage.render(file);
                return finishOne(status, url, file);
            }, 200);
        });
    }

    return processNext();
}

var arrayOfUrls;

if (system.args.length > 1) {
    arrayOfUrls = Array.prototype.slice.call(system.args, 1);
} else {
    console.log("Usage: phantomjs render_multi_url.js [domain.name1, domain.name2, ...]");
    arrayOfUrls = ["www.google.com", "www.bbc.co.uk", "phantomjs.org"];
}

RenderUrlsToFile(arrayOfUrls, function (status, url, file) {
    if (status === "success") {
        return console.log("Rendered '" + url + "' at '" + file + "'");
    }
    return console.log("Unable to render '" + url + "'");
}, function () {
    return phantom.exit();
});
/**
 * Captures the full height document even if it's not showing on the screen or captures with the provided range of screen sizes.
 *
 * A basic example for taking a screen shot using phantomjs which is sampled for https://nodejs-dersleri.github.io/
 *
 * usage : phantomjs responsive-screenshot.js {url} [output format] [doClipping]
 *
 * examples >
 *      phantomjs responsive-screenshot.js https://nodejs-dersleri.github.io/
 *      phantomjs responsive-screenshot.js https://nodejs-dersleri.github.io/ pdf
 *      phantomjs responsive-screenshot.js https://nodejs-dersleri.github.io/ true
 *      phantomjs responsive-screenshot.js https://nodejs-dersleri.github.io/ png true
 *
 * @author Salih sagdilek <salihsagdilek@gmail.com>
 */

/**
 * http://phantomjs.org/api/system/property/args.html
 *
 * Queries and returns a list of the command-line arguments.
 * The first one is always the script name, which is then followed by the subsequent arguments.
 */
var args = require('system').args;

/**
 * http://phantomjs.org/api/fs/
 *
 * file system api
 */
var fs = require('fs');

/**
 * http://phantomjs.org/api/webpage/
 *
 * Web page api
 */
var page = new WebPage();

/**
 * url address (second argument), output extension, clipping flag and the
 * list of viewports; all assigned below once the arguments are validated
 */
var urlAddress, ext, clipping, viewports;

if ( 1 === args.length ) {
    /**
     * if url address does not exist, exit phantom
     *
     * phantom.exit() does not stop the script from running on, so the rest of
     * the work lives in the else branch; otherwise args[1].toLowerCase()
     * would throw a TypeError on the missing argument.
     */
    console.log('Url address is required');
    phantom.exit();
} else {
    // NOTE(review): lower-casing the whole URL also changes its path and
    // query, which may be case-sensitive on the server - confirm this is intended.
    urlAddress = args[1].toLowerCase();
    ext = getFileExtension();
    clipping = getClipping();

    viewports = [
        { width : 1200, height : 800 },
        { width : 1024, height : 768 },
        { width : 768, height : 1024 },
        { width : 480, height : 640 },
        { width : 320, height : 480 }
    ];

    page.open(urlAddress, function (status) {
        if ( 'success' !== status ) {
            console.log('Unable to load the url address!');
        } else {
            var folder = urlToDir(urlAddress);
            var output, key;

            // Render from the last viewport down to the first.
            for ( key = viewports.length - 1; key >= 0; key-- ) {
                page.viewportSize = viewports[key];
                if ( clipping ) {
                    page.clipRect = viewports[key];
                }
                output = folder + "/" + getFileName(viewports[key]);
                console.log('Saving ' + output);
                page.render(output);
            }
        }
        phantom.exit();
    });
}
/**
 * filename generator helper
 *
 * Builds "<yyyy-mm-dd>_<hh-mm-ss-mmm>_<resolution><ext>" from the current
 * local time. Date and time both use the local clock (the date part used to
 * be UTC while the time part was local, which could yield a date that did not
 * match the time), and every component is zero-padded so names sort
 * chronologically.
 *
 * Reads the script-level `clipping` and `ext` values.
 *
 * @param viewport object with numeric `width` and `height` properties
 * @returns {string} the width, plus "x<height>" when clipping is enabled,
 *                   prefixed by the timestamp and suffixed by the extension
 */
function getFileName(viewport) {
    // Left-pads a number with zeros up to the requested length.
    function pad(value, length) {
        var text = String(value);
        while ( text.length < length ) {
            text = '0' + text;
        }
        return text;
    }
    var d = new Date();
    var date = [
        d.getFullYear(),
        pad(d.getMonth() + 1, 2),
        pad(d.getDate(), 2)
    ];
    var time = [
        pad(d.getHours(), 2),
        pad(d.getMinutes(), 2),
        pad(d.getSeconds(), 2),
        pad(d.getMilliseconds(), 3)
    ];
    var resolution = viewport.width + (clipping ? "x" + viewport.height : '');
    return date.join('-') + '_' + time.join('-') + "_" + resolution + ext;
}
/**
 * output extension format helper
 *
 * The third command-line argument names the output format unless it is
 * missing or is the literal 'true' (which is the clipping switch instead).
 *
 * @returns {*} the extension with its leading dot, '.png' by default
 */
function getFileExtension() {
    var format = args[2];
    var namesFormat = !!format && 'true' != format;
    return namesFormat ? '.' + format : '.png';
}
/**
 * check if clipping
 *
 * Clipping is requested by passing 'true' as either the fourth or the third
 * command-line argument.
 *
 * @returns {boolean}
 */
function getClipping() {
    return 'true' == args[3] || 'true' == args[2];
}
/**
 * url to directory helper
 *
 * Strips a leading "http://" or "https://" and one trailing slash from the
 * url, creates that path as a directory tree and returns it. When the tree
 * cannot be created a message is logged and PhantomJS is asked to exit.
 *
 * @param url
 * @returns {string}
 */
function urlToDir(url) {
    var withoutScheme = url.replace(/^(http|https):\/\//, '');
    var dir = withoutScheme.replace(/\/$/, '');
    var created = fs.makeTree(dir);
    if ( !created ) {
        console.log('"' + dir + '" is NOT created.');
        phantom.exit();
    }
    return dir;
}
"use strict";
var system = require('system');
/**
 * Wait until the test condition is true or a timeout occurs. Useful for waiting
 * on a server response or for a ui change (fadeIn, etc.) to occur.
 *
 * The polling interval is cleared BEFORE the outcome is handled, so an
 * `onReady` that throws cannot be invoked again by a later tick, and the
 * timeout branch cannot fire twice while PhantomJS is shutting down.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param onReady what to do when testFx condition is fulfilled,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param timeOutMillis the max amount of time to wait. If not specified (or 0), 3001 ms is used.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3001, //< Default max timeout is 3001ms
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
                return;
            }
            // Either outcome is final: stop polling before acting on it
            clearInterval(interval);
            if ( !condition ) {
                // Timed out while the condition was still 'false'
                console.log("'waitFor()' timeout");
                phantom.exit(1);
            } else {
                // Condition fulfilled
                console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                if ( typeof(onReady) === "string" ) {
                    eval(onReady);
                } else {
                    onReady();
                }
            }
        }, 100); //< repeat check every 100ms
}
// Entry point of the Jasmine (1.x report layout) runner: opens the URL given on
// the command line, waits until no spec is pending, prints the outcome and
// exits with 1 when specs failed (or the report could not be read), else 0.
if (system.args.length !== 2) {
    console.log('Usage: run-jasmine.js URL');
    phantom.exit(1);
}
var page = require('webpage').create();
// Route "console.log()" calls from within the Page context to the main Phantom context (i.e. current "this")
page.onConsoleMessage = function(msg) {
    console.log(msg);
};
page.open(system.args[1], function(status){
    if (status !== "success") {
        console.log("Unable to open " + system.args[1]);
        phantom.exit(1);
    } else {
        waitFor(function(){
            // The run is over once no '.pending' marker is left in the summary
            return page.evaluate(function(){
                return document.body.querySelector('.symbolSummary .pending') === null;
            });
        }, function(){
            var exitCode = page.evaluate(function(){
                // Declared locally: the loop counter used to be an implicit
                // global created inside the page.
                var i;
                try {
                    console.log('');
                    console.log(document.body.querySelector('.description').innerText);
                    var list = document.body.querySelectorAll('.results > #details > .specDetail.failed');
                    if (list && list.length > 0) {
                        console.log('');
                        console.log(list.length + ' test(s) FAILED:');
                        for (i = 0; i < list.length; ++i) {
                            var el = list[i],
                                desc = el.querySelector('.description'),
                                msg = el.querySelector('.resultMessage.fail');
                            console.log('');
                            console.log(desc.innerText);
                            console.log(msg.innerText);
                            console.log('');
                        }
                        return 1;
                    } else {
                        console.log(document.body.querySelector('.alert > .passingAlert.bar').innerText);
                        return 0;
                    }
                } catch (ex) {
                    // An unreadable report counts as a failure
                    console.log(ex);
                    return 1;
                }
            });
            phantom.exit(exitCode);
        });
    }
});
"use strict";
var system = require('system');
/**
 * Wait until the test condition is true or a timeout occurs. Useful for waiting
 * on a server response or for a ui change (fadeIn, etc.) to occur.
 *
 * The polling interval is cleared BEFORE the outcome is handled, so an
 * `onReady` that throws cannot be invoked again by a later tick, and the
 * timeout branch cannot fire twice while PhantomJS is shutting down.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param onReady what to do when testFx condition is fulfilled,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param timeOutMillis the max amount of time to wait. If not specified (or 0), 3001 ms is used.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3001, //< Default max timeout is 3001ms
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
                return;
            }
            // Either outcome is final: stop polling before acting on it
            clearInterval(interval);
            if ( !condition ) {
                // Timed out while the condition was still 'false'
                console.log("'waitFor()' timeout");
                phantom.exit(1);
            } else {
                // Condition fulfilled
                console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                if ( typeof(onReady) === "string" ) {
                    eval(onReady);
                } else {
                    onReady();
                }
            }
        }, 100); //< repeat check every 100ms
}
// Entry point of the Jasmine 2 runner: opens the URL given on the command
// line, waits until no spec is pending and the duration is shown, prints the
// outcome and exits with 1 when specs failed (or the report could not be
// read), else 0.
if (system.args.length !== 2) {
    console.log('Usage: run-jasmine2.js URL');
    phantom.exit(1);
}
var page = require('webpage').create();
// Route "console.log()" calls from within the Page context to the main Phantom context (i.e. current "this")
page.onConsoleMessage = function(msg) {
    console.log(msg);
};
page.open(system.args[1], function(status){
    if (status !== "success") {
        console.log("Unable to access network");
        // A suite that could not be loaded must not be reported as a success
        phantom.exit(1);
    } else {
        waitFor(function(){
            return page.evaluate(function(){
                return (document.body.querySelector('.symbolSummary .pending') === null &&
                        document.body.querySelector('.duration') !== null);
            });
        }, function(){
            var exitCode = page.evaluate(function(){
                // Declared locally: the loop counter used to be an implicit
                // global created inside the page.
                var i;
                try {
                    console.log('');
                    var title = 'Jasmine';
                    var version = document.body.querySelector('.version').innerText;
                    var duration = document.body.querySelector('.duration').innerText;
                    var banner = title + ' ' + version + ' ' + duration;
                    console.log(banner);
                    var list = document.body.querySelectorAll('.results > .failures > .spec-detail.failed');
                    if (list && list.length > 0) {
                        console.log('');
                        console.log(list.length + ' test(s) FAILED:');
                        for (i = 0; i < list.length; ++i) {
                            var el = list[i],
                                desc = el.querySelector('.description'),
                                msg = el.querySelector('.messages > .result-message');
                            console.log('');
                            console.log(desc.innerText);
                            console.log(msg.innerText);
                            console.log('');
                        }
                        return 1;
                    } else {
                        console.log(document.body.querySelector('.alert > .bar.passed,.alert > .bar.skipped').innerText);
                        return 0;
                    }
                } catch (ex) {
                    // A missing report element used to abort the evaluation
                    // without an exit code; treat it as a failure instead.
                    console.log(ex);
                    return 1;
                }
            });
            phantom.exit(exitCode);
        });
    }
});
"use strict";
var system = require('system');
/**
 * Wait until the test condition is true or a timeout occurs. Useful for waiting
 * on a server response or for a ui change (fadeIn, etc.) to occur.
 *
 * The polling interval is cleared BEFORE the outcome is handled, so an
 * `onReady` that throws cannot be invoked again by a later tick, and the
 * timeout branch cannot fire twice while PhantomJS is shutting down.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param onReady what to do when testFx condition is fulfilled,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param timeOutMillis the max amount of time to wait. If not specified (or 0), 3001 ms is used.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3001, //< Default max timeout is 3001ms
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
                return;
            }
            // Either outcome is final: stop polling before acting on it
            clearInterval(interval);
            if ( !condition ) {
                // Timed out while the condition was still 'false'
                console.log("'waitFor()' timeout");
                phantom.exit(1);
            } else {
                // Condition fulfilled
                console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                if ( typeof(onReady) === "string" ) {
                    eval(onReady);
                } else {
                    onReady();
                }
            }
        }, 100); //< repeat check every 100ms
}
// Entry point of the QUnit runner: opens the URL given on the command line,
// waits for the '#qunit-testresult' element to mention 'completed', prints it
// and exits with 1 when the reported number of failures is positive, else 0.
if (system.args.length !== 2) {
    console.log('Usage: run-qunit.js URL');
    phantom.exit(1);
}
var page = require('webpage').create();
// Route "console.log()" calls from within the Page context to the main Phantom context (i.e. current "this")
page.onConsoleMessage = function(msg) {
    console.log(msg);
};
page.open(system.args[1], function(status){
    // True once the result element exists and its text mentions 'completed'
    function testsCompleted() {
        return page.evaluate(function(){
            var resultNode = document.getElementById('qunit-testresult');
            return !!(resultNode && resultNode.innerText.match('completed'));
        });
    }
    // Prints the summary and turns the failure count into the exit code
    function reportAndExit() {
        var failedNum = page.evaluate(function(){
            var resultNode = document.getElementById('qunit-testresult');
            var failedText;
            console.log(resultNode.innerText);
            try {
                failedText = resultNode.getElementsByClassName('failed')[0].innerHTML;
            } catch (e) {
                // No readable failure counter: report a large failure count
                failedText = 10000;
            }
            return failedText;
        });
        var hasFailures = parseInt(failedNum, 10) > 0;
        phantom.exit(hasFailures ? 1 : 0);
    }
    if (status === "success") {
        waitFor(testsCompleted, reportAndExit);
    } else {
        console.log("Unable to access network");
        phantom.exit(1);
    }
});
// List all the files in a Tree of Directories
"use strict";
var system = require('system');
/**
 * Prints `path` when it is an existing file; when it is a directory, recurses
 * into every entry except "." and "..". Anything else is silently ignored.
 *
 * @param path file or directory path to scan
 */
var scanDirectory = function (path) {
    var fs = require('fs');
    if (fs.exists(path) && fs.isFile(path)) {
        console.log(path);
    } else if (fs.isDirectory(path)) {
        fs.list(path).forEach(function (e) {
            if ( e !== "." && e !== ".." ) { //< Avoid loops
                scanDirectory(path + '/' + e);
            }
        });
    }
};
if (system.args.length !== 2) {
    console.log("Usage: phantomjs scandir.js DIRECTORY_TO_SCAN");
    phantom.exit(1);
} else {
    // Only scan when the argument is present: the usage branch used to fall
    // through to the scan and to a second, argument-less phantom.exit().
    scanDirectory(system.args[1]);
    phantom.exit();
}
"use strict";
// Starts a web server on the given port, sends it one request from a web page
// and prints both the request seen by the server and the reply seen by the page.
var page = require('webpage').create();
var server = require('webserver').create();
var system = require('system');
var port;
if (system.args.length !== 2) {
    console.log('Usage: server.js <some port>');
    phantom.exit(1);
} else {
    port = system.args[1];
    var listening = server.listen(port, function (request, response) {
        console.log("GOT HTTP REQUEST");
        console.log(JSON.stringify(request, null, 4));
        // we set the headers here
        response.statusCode = 200;
        response.headers = {"Cache": "no-cache", "Content-Type": "text/html"};
        // this is also possible:
        response.setHeader("foo", "bar");
        // now we write the body
        // note: the headers above will now be sent implicitly
        response.write("<html><head><title>YES!</title></head>");
        // note: write can be called multiple times
        response.write("<body><p>pretty cool :)</body></html>");
        response.close();
    });
    if (!listening) {
        console.log("could not create web server listening on port " + port);
        phantom.exit();
    } else {
        // Only talk to the server when it is actually listening: the failure
        // branch used to fall through and still issue the request.
        var url = "http://localhost:" + port + "/foo/bar.php?asdf=true";
        console.log("SENDING REQUEST TO:");
        console.log(url);
        page.open(url, function (status) {
            if (status !== 'success') {
                console.log('FAIL to load the address');
            } else {
                console.log("GOT REPLY FROM SERVER:");
                console.log(page.content);
            }
            phantom.exit();
        });
    }
}
"use strict";
// Keep-alive echo server: answers every request with the request itself,
// pretty-printed as JSON, over a persistent connection.
var port, server, service,
    system = require('system');
if (system.args.length === 2) {
    port = system.args[1];
    server = require('webserver').create();
    service = server.listen(port, { keepAlive: true }, function (request, response) {
        console.log('Request at ' + new Date());
        var body = JSON.stringify(request, null, 4);
        console.log(body);
        var replyHeaders = {};
        replyHeaders['Cache'] = 'no-cache';
        replyHeaders['Content-Type'] = 'text/plain';
        replyHeaders['Connection'] = 'Keep-Alive';
        replyHeaders['Keep-Alive'] = 'timeout=5, max=100';
        // NOTE(review): this is the number of characters, which presumably
        // differs from the byte count for non-ASCII bodies - confirm.
        replyHeaders['Content-Length'] = body.length;
        response.statusCode = 200;
        response.headers = replyHeaders;
        response.write(body);
        response.close();
    });
    if (!service) {
        console.log('Error: Could not create web server listening on port ' + port);
        phantom.exit();
    } else {
        console.log('Web server running on port ' + port);
    }
} else {
    console.log('Usage: serverkeepalive.js <portnumber>');
    phantom.exit(1);
}
"use strict";
// Minimal web server: answers every request with an HTML page that embeds the
// request, pretty-printed as JSON.
var port, server, service,
    system = require('system');
if (system.args.length === 2) {
    port = system.args[1];
    server = require('webserver').create();
    service = server.listen(port, function (request, response) {
        console.log('Request at ' + new Date());
        console.log(JSON.stringify(request, null, 4));
        response.statusCode = 200;
        response.headers = {
            'Cache': 'no-cache',
            'Content-Type': 'text/html'
        };
        // The page is sent piece by piece, in this exact order
        var chunks = [
            '<html>',
            '<head>',
            '<title>Hello, world!</title>',
            '</head>',
            '<body>',
            '<p>This is from PhantomJS web server.</p>',
            '<p>Request data:</p>',
            '<pre>',
            JSON.stringify(request, null, 4),
            '</pre>',
            '</body>',
            '</html>'
        ];
        var index;
        for (index = 0; index < chunks.length; index++) {
            response.write(chunks[index]);
        }
        response.close();
    });
    if (!service) {
        console.log('Error: Could not create web server listening on port ' + port);
        phantom.exit();
    } else {
        console.log('Web server running on port ' + port);
    }
} else {
    console.log('Usage: simpleserver.js <portnumber>');
    phantom.exit(1);
}
// sleepsort.js - Sort integers from the commandline in a very ridiculous way: leveraging timeouts :P
"use strict";
var system = require('system');
/**
 * Schedules one timer per element, using the element itself as the delay.
 * Each timer prints its element; the timer that prints the last outstanding
 * element also invokes `callback`.
 *
 * @param array values to print (each one doubles as its own delay)
 * @param callback invoked once every element has been printed
 */
function sleepSort(array, callback) {
    var total = array.length,
        printed = 0,
        position;
    // Binds one position to its own timer callback
    function emitterFor(slot) {
        return function() {
            console.log(array[slot]);
            ++printed;
            if (total === printed) {
                callback();
            }
        };
    }
    for ( position = 0; position < total; ++position ) {
        setTimeout(emitterFor(position), array[position]);
    }
}
if ( system.args.length >= 2 ) {
    sleepSort(system.args.slice(1), function() {
        phantom.exit();
    });
} else {
    console.log("Usage: phantomjs sleepsort.js PUT YOUR INTEGERS HERE SEPARATED BY SPACES");
    phantom.exit(1);
}
"use strict";
// Demonstrates the standard streams exposed by the 'system' module: writes to
// stdout and stderr, then echoes what is read from stdin as JSON.
var system = require('system');
var out = system.stdout;
var err = system.stderr;
out.write('Hello, system.stdout.write!');
out.writeLine('\nHello, system.stdout.writeLine!');
err.write('Hello, system.stderr.write!');
err.writeLine('\nHello, system.stderr.writeLine!');
out.writeLine('system.stdin.readLine(): ');
var firstLine = system.stdin.readLine();
out.writeLine(JSON.stringify(firstLine));
// This is essentially a `readAll`
// NOTE(review): the call below passes 5 to read() - confirm how that relates
// to reading everything up to end of input.
out.writeLine('system.stdin.read(5): (ctrl+D to end)');
var remainder = system.stdin.read(5);
out.writeLine(JSON.stringify(remainder));
phantom.exit(0);
// This is to be used by "module.js" (and "module.coffee") example(s).
// There should NOT be a "universe.coffee" as only 1 of the 2 would
// ever be loaded unless the file extension was specified.
"use strict";
// Logs that the universe is starting.
var announceStart = function () {
    console.log('Starting the universe....');
};
// Public interface of the module: a constant and a function.
exports.answer = 42;
exports.start = announceStart;
// Modify global object at the page initialization.
// In this example, effectively Math.random() always returns 0.42.
"use strict";
var page = require('webpage').create();
// Runs when the page object is initialized: swap in the fixed Math.random
// inside the page context.
page.onInitialized = function () {
    page.evaluate(function () {
        Math.random = function() {
            return 42 / 100;
        };
    });
};
// Load the demo page and print the text of its 'numbers' element.
// (The unused local `result` was removed.)
page.open('http://ariya.github.com/js/random/', function (status) {
    if (status !== 'success') {
        console.log('Network error.');
    } else {
        console.log(page.evaluate(function () {
            return document.getElementById('numbers').textContent;
        }));
    }
    phantom.exit();
});
"use strict";
// Prints the default user agent, overrides it, then prints the text of the
// 'myagent' element of the page loaded with the overridden value.
var page = require('webpage').create();
console.log('The default user agent is ' + page.settings.userAgent);
page.settings.userAgent = 'SpecialAgent';
page.open('http://www.httpuseragent.org', function (status) {
    if (status === 'success') {
        var reportedAgent = page.evaluate(function () {
            var agentNode = document.getElementById('myagent');
            return agentNode.innerText;
        });
        console.log(reportedAgent);
    } else {
        console.log('Unable to access network');
    }
    phantom.exit();
});
"use strict";
// Prints the PhantomJS version as "major.minor.patch" and exits.
var versionParts = [
    phantom.version.major,
    phantom.version.minor,
    phantom.version.patch
];
console.log('using PhantomJS version ' + versionParts.join('.'));
phantom.exit();
/**
 * Wait until the test condition is true or a timeout occurs. Useful for waiting
 * on a server response or for a ui change (fadeIn, etc.) to occur.
 *
 * The polling interval is cleared BEFORE the outcome is handled, so an
 * `onReady` that throws cannot be invoked again by a later tick, and the
 * timeout branch cannot fire twice while PhantomJS is shutting down.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param onReady what to do when testFx condition is fulfilled,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function. Strings are run through eval(): pass trusted code only.
 * @param timeOutMillis the max amount of time to wait. If not specified (or 0), 3 sec is used.
 */
"use strict";
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default max timeout is 3s
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
                return;
            }
            // Either outcome is final: stop polling before acting on it
            clearInterval(interval);
            if ( !condition ) {
                // Timed out while the condition was still 'false'
                console.log("'waitFor()' timeout");
                phantom.exit(1);
            } else {
                // Condition fulfilled
                console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                if ( typeof(onReady) === "string" ) {
                    eval(onReady);
                } else {
                    onReady();
                }
            }
        }, 250); //< repeat check every 250ms
}
var page = require('webpage').create();
// Open Twitter on 'sencha' profile and, onPageLoad, do...
page.open("http://twitter.com/#!/sencha", function (status) {
    // Check for page load success
    if (status !== "success") {
        console.log("Unable to access network");
        // Without an explicit exit PhantomJS keeps running after the failure
        phantom.exit(1);
    } else {
        // Wait for 'signin-dropdown' to be visible
        waitFor(function() {
            // Check in the page if a specific element is now visible
            return page.evaluate(function() {
                return $("#signin-dropdown").is(":visible");
            });
        }, function() {
            console.log("The sign-in dialog should be visible now.");
            phantom.exit();
        });
    }
});
"use strict";
var p = require("webpage").create();
// Returns the document title of the frame `page` currently has selected.
function pageTitle(page) {
    var readTitle = function(){
        return window.document.title;
    };
    return page.evaluate(readTitle);
}
// Sets the document title of the frame `page` currently has selected.
// The new title is handed to the page context as an evaluate() argument.
function setPageTitle(page, newTitle) {
    var writeTitle = function(title){
        window.document.title = title;
    };
    page.evaluate(writeTitle, newTitle);
}
// Walks through the frames of the test page. After every frame switch the
// state of the selected frame is printed and "-visited" is appended to its title.
p.open("../test/webpage-spec-frames/index.html", function(status) {
    // Prints title, name and child-frame details of the selected frame, then
    // tags its title and ends the section with an empty line.
    function reportAndMarkVisited() {
        console.log("pageTitle(): " + pageTitle(p));
        console.log("currentFrameName(): "+p.currentFrameName());
        console.log("childFramesCount(): "+p.childFramesCount());
        console.log("childFramesName(): "+p.childFramesName());
        console.log("setPageTitle(CURRENT TITLE+'-visited')");
        setPageTitle(p, pageTitle(p) + "-visited");
        console.log("");
    }

    // Main frame, as loaded
    reportAndMarkVisited();

    console.log("p.switchToChildFrame(\"frame1\"): "+p.switchToChildFrame("frame1"));
    reportAndMarkVisited();

    console.log("p.switchToChildFrame(\"frame1-2\"): "+p.switchToChildFrame("frame1-2"));
    reportAndMarkVisited();

    console.log("p.switchToParentFrame(): "+p.switchToParentFrame());
    reportAndMarkVisited();

    console.log("p.switchToChildFrame(0): "+p.switchToChildFrame(0));
    reportAndMarkVisited();

    console.log("p.switchToMainFrame()");
    p.switchToMainFrame();
    reportAndMarkVisited();

    console.log("p.switchToChildFrame(\"frame2\"): "+p.switchToChildFrame("frame2"));
    reportAndMarkVisited();

    phantom.exit();
});
This document contains the list of Third Party Software included with
PhantomJS, along with the license information.
Third Party Software may impose additional restrictions and it is the
user's responsibility to ensure that they have met the licensing
requirements of PhantomJS and the relevant license of the Third Party
Software they are using.
Qt - http://qt-project.org/
License: GNU Lesser General Public License (LGPL) version 2.1.
Reference: http://qt-project.org/doc/qt-4.8/lgpl.html.
WebKit - http://www.webkit.org/
License: GNU Lesser General Public License (LGPL) version 2.1 and BSD.
Reference: http://www.webkit.org/coding/lgpl-license.html and
http://www.webkit.org/coding/bsd-license.html.
Mongoose - https://github.com/cesanta/mongoose
License: MIT
Reference: https://github.com/cesanta/mongoose/commit/abbf27338ef554cce0281ac157aa71a9c1b82a55
OpenSSL - http://www.openssl.org/
License: OpenSSL License, SSLeay License.
Reference: http://www.openssl.org/source/license.html.
Linenoise - https://github.com/tadmarshall/linenoise
License: BSD.
Reference: https://github.com/tadmarshall/linenoise/blob/master/linenoise.h.
QCommandLine - http://xf.iksaif.net/dev/qcommandline.html
License: GNU Lesser General Public License (LGPL) version 2.1.
Reference: http://dev.iksaif.net/projects/qcommandline/repository/revisions/master/entry/COPYING
wkhtmltopdf - http://code.google.com/p/wkhtmltopdf/
License: GNU Lesser General Public License (LGPL)
Reference: http://code.google.com/p/wkhtmltopdf/
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# [PhantomJS](http://phantomjs.org) - Scriptable Headless WebKit
PhantomJS ([phantomjs.org](http://phantomjs.org)) is a headless WebKit scriptable with JavaScript. The latest [stable release](http://phantomjs.org/release-2.0.html) is version 2.0.
**Note**: Please **do not** create a GitHub pull request **without** reading the [Contribution Guide](https://github.com/ariya/phantomjs/blob/master/CONTRIBUTING.md) first. Failure to do so may result in the rejection of the pull request.
## Use Cases
- **Headless web testing**. Lightning-fast testing without the browser is now possible!
- **Page automation**. [Access and manipulate](http://phantomjs.org/page-automation.html) web pages with the standard DOM API, or with usual libraries like jQuery.
- **Screen capture**. Programmatically [capture web contents](http://phantomjs.org/screen-capture.html), including CSS, SVG and Canvas. Build server-side web graphics apps, from a screenshot service to a vector chart rasterizer.
- **Network monitoring**. Automate performance analysis, track [page loading](http://phantomjs.org/network-monitoring.html) and export as standard HAR format.
## Features
- **Multiplatform**, available on major operating systems: Windows, Mac OS X, Linux, and other Unices.
- **Fast and native implementation** of web standards: DOM, CSS, JavaScript, Canvas, and SVG. No emulation!
- **Pure headless (no X11) on Linux**, ideal for continuous integration systems. Also runs on Amazon EC2, Heroku, and Iron.io.
- **Easy to install**: [Download](http://phantomjs.org/download.html), unpack, and start having fun in just 5 minutes.
## Questions?
- Explore the complete [documentation](http://phantomjs.org/documentation/).
- Read tons of [user articles](http://phantomjs.org/buzz.html) on using PhantomJS.
- Join the [mailing-list](http://groups.google.com/group/phantomjs) and discuss with other PhantomJS fans.
PhantomJS is free software/open source, and is distributed under the [BSD license](http://opensource.org/licenses/BSD-3-Clause). It contains third-party code, see the included `third-party.txt` file for the license information on third-party code.
PhantomJS is created and maintained by [Ariya Hidayat](http://ariya.ofilabs.com/about) (Twitter: [@ariyahidayat](http://twitter.com/ariyahidayat)), with the help of [many contributors](https://github.com/ariya/phantomjs/contributors). Follow the official Twitter stream [@PhantomJS](http://twitter.com/PhantomJS) to get the frequent development updates.
//codes.js
// Loads the URL passed on the command line and prints the resulting page
// content to the console.
// `system` and `address` are now declared with `var`; they used to be
// assigned as implicit globals.
var system = require('system');
// Second command-line entry (the first one is the script name): the URL to load
var address = system.args[1];
var page = require('webpage').create();
var url = address;
page.open(url, function (status) {
    // Page is loaded!
    if (status !== 'success') {
        console.log('Unable to post!');
    } else {
        // page.evaluate() could be used here to run code against the page's
        // own JS API (for example to read document.title).
        console.log(page.content);
    }
    phantom.exit();
});
"use strict";
// Prints every command-line entry with its index, or a hint when only the
// script name is present.
var system = require('system');
var argv = system.args;
var position;
if (argv.length !== 1) {
    for (position = 0; position < argv.length; position++) {
        console.log(position + ': ' + argv[position]);
    }
} else {
    console.log('Try to pass some args when invoking this script!');
}
phantom.exit();
"use strict";
// Demonstrates the 'child_process' module: spawn() with event listeners and
// execFile() with a completion callback. PhantomJS exits after two seconds.
var childProcess = require("child_process");
var spawn = childProcess.spawn;
var execFile = childProcess.execFile;

// Lists a path spelled "/rooot"; output, errors and the exit code are logged
var child = spawn("ls", ["-lF", "/rooot"]);

function logSpawnStdout(data) {
    console.log("spawnSTDOUT:", JSON.stringify(data));
}
function logSpawnStderr(data) {
    console.log("spawnSTDERR:", JSON.stringify(data));
}
function logSpawnExit(code) {
    console.log("spawnEXIT:", code);
}

child.stdout.on("data", logSpawnStdout);
child.stderr.on("data", logSpawnStderr);
child.on("exit", logSpawnExit);

//child.kill("SIGKILL")

execFile("ls", ["-lF", "/usr"], null, function (err, stdout, stderr) {
    console.log("execFileSTDOUT:", JSON.stringify(stdout));
    console.log("execFileSTDERR:", JSON.stringify(stderr));
});

setTimeout(function () {
    phantom.exit(0);
}, 2000);
//codes.js
// Loads the URL passed on the command line and prints the resulting page
// content to the console.
// `system` and `address` are now declared with `var`; they used to be
// assigned as implicit globals.
var system = require('system');
// Second command-line entry (the first one is the script name): the URL to load
var address = system.args[1];
var page = require('webpage').create();
var url = address;
page.open(url, function (status) {
    // Page is loaded!
    if (status !== 'success') {
        console.log('Unable to post!');
    } else {
        // page.evaluate() could be used here to run code against the page's
        // own JS API (for example to read document.title).
        console.log(page.content);
    }
    phantom.exit();
});
"use strict";
// Draws a color wheel on a canvas inside a blank page and renders the page
// to 'colorwheel.png'.
var page = require('webpage').create();
page.viewportSize = { width: 400, height : 400 };
page.content = '<html><body><canvas id="surface"></canvas></body></html>';
// Everything inside evaluate() runs in the page context.
page.evaluate(function() {
// cx/cy: center of the window; radius: width / 2.3.
// `value`, `p` and `rgb` are declared but never used below.
var el = document.getElementById('surface'),
context = el.getContext('2d'),
width = window.innerWidth,
height = window.innerHeight,
cx = width / 2,
cy = height / 2,
radius = width / 2.3,
imageData,
pixels,
hue, sat, value,
i = 0, x, y, rx, ry, d,
f, g, p, u, v, w, rgb;
// Size the canvas to the window and get a pixel buffer of the same size.
el.width = width;
el.height = height;
imageData = context.createImageData(width, height);
pixels = imageData.data;
// Visit every pixel; `i` advances by 4 per pixel (four entries per pixel).
for (y = 0; y < height; y = y + 1) {
for (x = 0; x < width; x = x + 1, i = i + 4) {
rx = x - cx;
ry = y - cy;
// Squared distance from the center; only pixels inside the circle are written.
d = rx * rx + ry * ry;
if (d < radius * radius) {
// Angle around the center mapped onto 0..6; distance mapped onto 0..1.
hue = 6 * (Math.atan2(ry, rx) + Math.PI) / (2 * Math.PI);
sat = Math.sqrt(d) / radius;
// g: integer sector of the hue (0..6, hence the 7-entry tables below);
// f: fractional position inside that sector.
g = Math.floor(hue);
f = hue - g;
u = 255 * (1 - sat);
v = 255 * (1 - sat * f);
w = 255 * (1 - sat * (1 - f));
// Per-sector lookup for the first three entries of the pixel (presumably
// red, green and blue - TODO confirm); the fourth entry is always 255.
pixels[i] = [255, v, u, u, w, 255, 255][g];
pixels[i + 1] = [w, 255, 255, v, u, u, w][g];
pixels[i + 2] = [u, u, w, 255, 255, v, u][g];
pixels[i + 3] = 255;
}
}
}
// Paint the buffer, then give the page a white background without margin.
context.putImageData(imageData, 0, 0);
document.body.style.backgroundColor = 'white';
document.body.style.margin = '0px';
});
page.render('colorwheel.png');
phantom.exit();
"use strict";
// Prints 10 down to 1, one number per second, then "BLAST OFF!" and exits.
var t = 10,
    interval = setInterval(function(){
        if ( t <= 0 ) {
            console.log("BLAST OFF!");
            phantom.exit();
            return;
        }
        console.log(t--);
    }, 1000);
// Detect if a web page sniffs the user agent or not.
"use strict";
// sniffed: result read back from the page; address: URL to check.
var page = require('webpage').create(),
system = require('system'),
sniffed,
address;
// On page initialization, replace window.navigator inside the page with a
// stand-in object whose `userAgent` and `platform` getters record that they
// were read.
page.onInitialized = function () {
page.evaluate(function () {
(function () {
// Keep the real values so the getters can still return them.
var userAgent = window.navigator.userAgent,
platform = window.navigator.platform;
window.navigator = {
appCodeName: 'Mozilla',
appName: 'Netscape',
cookieEnabled: false,
sniffed: false
};
// Reading either property sets navigator.sniffed to true before
// returning the original value.
window.navigator.__defineGetter__('userAgent', function () {
window.navigator.sniffed = true;
return userAgent;
});
window.navigator.__defineGetter__('platform', function () {
window.navigator.sniffed = true;
return platform;
});
})();
});
};
// Entry point: load the address given on the command line, wait 1.5 seconds
// after a successful load, then report whether the page's navigator.sniffed
// flag was set.
if (system.args.length !== 1) {
    address = system.args[1];
    console.log('Checking ' + address + '...');
    page.open(address, function (status) {
        if (status === 'success') {
            window.setTimeout(function () {
                sniffed = page.evaluate(function () {
                    return navigator.sniffed;
                });
                var verdict = sniffed ?
                        'The page tried to sniff the user agent.' :
                        'The page did not try to sniff the user agent.';
                console.log(verdict);
                phantom.exit();
            }, 1500);
        } else {
            console.log('FAIL to load the address');
            phantom.exit();
        }
    });
} else {
    console.log('Usage: detectsniff.js <some URL>');
    phantom.exit(1);
}
// echoToFile.js - Write in a given file all the parameters passed on the CLI
"use strict";
var fs = require('fs'),
    system = require('system');
if (system.args.length < 3) {
    console.log("Usage: echoToFile.js DESTINATION_FILE <arguments to echo...>");
    phantom.exit(1);
} else {
    // Everything after the destination file, separated by single spaces
    // (replaces the hand-written concatenation loop and its unused `f`).
    var content = system.args.slice(2).join(' ');
    try {
        fs.write(system.args[1], content, 'w');
        phantom.exit();
    } catch(e) {
        // A failed write is reported through a non-zero exit code
        console.log(e);
        phantom.exit(1);
    }
}
"use strict";
// List the browser features Modernizr reports as supported / not supported
// in this PhantomJS build.
var feature, supported = [], unsupported = [];

// phantom.injectJs returns false when the script cannot be loaded. Without
// this check the first use of Modernizr throws and the script aborts before
// it can call phantom.exit().
if (!phantom.injectJs('modernizr.js') || typeof Modernizr === 'undefined') {
    console.log('Unable to load modernizr.js');
    phantom.exit(1);
} else {
    console.log('Detected features (using Modernizr ' + Modernizr._version + '):');
    for (feature in Modernizr) {
        if (Modernizr.hasOwnProperty(feature)) {
            // Skip private members (leading underscore), helper functions and
            // the 'input' / 'inputtypes' entries.
            if (feature[0] !== '_' && typeof Modernizr[feature] !== 'function' &&
                    feature !== 'input' && feature !== 'inputtypes') {
                if (Modernizr[feature]) {
                    supported.push(feature);
                } else {
                    unsupported.push(feature);
                }
            }
        }
    }

    console.log('');
    console.log('Supported:');
    supported.forEach(function (e) {
        console.log(' ' + e);
    });

    console.log('');
    console.log('Not supported:');
    unsupported.forEach(function (e) {
        console.log(' ' + e);
    });
    phantom.exit();
}
"use strict";
// Every 300 ms, log the newest Fibonacci number and append the next one.
// Stop and exit once the sequence holds more than 10 entries.
var fibs = [0, 1];
var ticker = window.setInterval(function () {
    var newest = fibs[fibs.length - 1],
        previous = fibs[fibs.length - 2];

    console.log(newest);
    fibs.push(newest + previous);

    if (fibs.length <= 10) {
        return;
    }
    window.clearInterval(ticker);
    phantom.exit();
}, 300);
//codes.js
// Load the URL given as the first command-line argument with browser-like
// request headers and print the resulting page content to stdout.
var system = require('system');
// args[0] is the script name; args[1] is the URL to load (used below).
var address = system.args[1];
var page = require('webpage').create();
var url = address;
var headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
};

if (!url) {
    console.log('Usage: codes.js <some URL>');
    phantom.exit(1);
} else {
    // The user agent is a page setting in PhantomJS; set it explicitly so it
    // applies to the request.
    page.settings.userAgent = headers["User-Agent"];

    // page.open(url, settings, callback) reads request headers from
    // settings.headers. Passing the header map itself as `settings` leaves
    // the headers unused, so wrap it in a proper settings object.
    page.open(url, { operation: "GET", headers: headers }, function (status) {
        // Page is loaded!
        if (status !== 'success') {
            console.log('Unable to post!');
        } else {
            // Example of using the page's own JS API from here:
            //   var title = page.evaluate(function () { return document.title; });
            console.log(page.content);
        }
        phantom.exit();
    });
}
// Use 'page.injectJs()' to load the script itself in the Page context
"use strict";
if (typeof phantom === "undefined") {
    // No `phantom` global: this copy of the script is the injected one.
    alert("* Script running in the Page context.");
} else {
    var page = require('webpage').create();

    // Forward both console.log() and alert() calls made inside the page to
    // the console of the main Phantom context.
    var forwardToConsole = function (msg) {
        console.log(msg);
    };
    page.onConsoleMessage = forwardToConsole;
    page.onAlert = forwardToConsole;

    console.log("* Script running in the Phantom context.");
    console.log("* Script will 'inject' itself in a page...");
    page.open("about:blank", function (status) {
        var outcome;
        if (status === "success") {
            if (page.injectJs("injectme.js")) {
                outcome = "... done injecting itself!";
            } else {
                outcome = "... fail! Check the $PWD?!";
            }
            console.log(outcome);
        }
        phantom.exit();
    });
}
"use strict";
// Open the URL given on the command line, then report the page title and the
// time elapsed between issuing the request and the load callback.
var page = require('webpage').create(),
    system = require('system'),
    t, address;

if (system.args.length !== 1) {
    t = Date.now();
    address = system.args[1];
    page.open(address, function (status) {
        var title;
        if (status === 'success') {
            // Take the elapsed time first so the title lookup is not counted.
            t = Date.now() - t;
            title = page.evaluate(function () {
                return document.title;
            });
            console.log('Page title is ' + title);
            console.log('Loading time ' + t + ' msec');
        } else {
            console.log('FAIL to load the address');
        }
        phantom.exit();
    });
} else {
    // Only the script name was supplied.
    console.log('Usage: loadspeed.js <some URL>');
    phantom.exit(1);
}
"use strict";
// Load a page while aborting every request that looks like a stylesheet.
var page = require('webpage').create(),
    system = require('system'),
    address = system.args[1];

// Look up a header value by name (case-insensitive) in a list of
// { name, value } entries. PhantomJS documents requestData.headers as such a
// list, so indexing it with the header name directly always gives undefined.
function headerValue(headers, name) {
    var wanted = name.toLowerCase(),
        i;
    if (!headers) {
        return undefined;
    }
    for (i = 0; i < headers.length; i++) {
        if (headers[i] && String(headers[i].name).toLowerCase() === wanted) {
            return headers[i].value;
        }
    }
    return undefined;
}

if (system.args.length < 2) {
    console.log('Usage: loadurlwithoutcss.js URL');
    // Exit with an error code and do not fall through to page.open() with an
    // undefined address.
    phantom.exit(1);
} else {
    page.onResourceRequested = function (requestData, request) {
        // Match both http:// and https:// stylesheet URLs.
        var isCssUrl = (/https?:\/\/.+?\.css/i).test(requestData['url']),
            isCssType = headerValue(requestData.headers, 'Content-Type') == 'text/css';
        if (isCssUrl || isCssType) {
            console.log('The url of the request is matching. Aborting: ' + requestData['url']);
            request.abort();
        }
    };

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
        }
        phantom.exit();
    });
}
\ No newline at end of file
// Render a screenshot of a page: open the URL, scroll to the bottom, click the
// fourth ".header" element after 5 s, and render the page to OUTPUT 0.5 s later.
// Arguments: URL OUTPUT
var page = require('webpage').create(),
    system = require('system'),
    url, output;

if (system.args.length < 3) {
    console.log('Usage: phantomjs <script> URL OUTPUT');
    phantom.exit(1);
} else {
    url = system.args[1];
    output = system.args[2];

    page.open(url, function (status) {
        if (status != "success") {
            console.log('FAIL to load the address');
            phantom.exit();
            // Stop here so the evaluate/render steps below are not scheduled
            // for a page that failed to load.
            return;
        }

        page.evaluate(function () {
            // This function runs inside the target page, not in the PhantomJS
            // context, so it cannot use (or change) variables of this script.
            window.scrollTo(0, 10000); // scroll to the bottom
            window.setTimeout(function () {
                var plist = document.querySelectorAll(".header");
                // Click the element at index 3, if there is one.
                if (plist.length > 3) {
                    plist[3].click();
                }
            }, 5000);
        });

        // Render shortly after the in-page click has fired.
        window.setTimeout(function () {
            page.render(output);
            phantom.exit();
        }, 5000 + 500);
    });
}
\ No newline at end of file
// Render a screenshot of a page: open the URL, scroll to the bottom and render
// the page to OUTPUT after 5.5 s. For URLs containing "shuqi", the fourth
// ".header" element is clicked after 5 s, before rendering.
// Arguments: URL OUTPUT
var page = require('webpage').create(),
    system = require('system'),
    url, output;

if (system.args.length < 3) {
    // Without this check url.indexOf() below would throw on a missing URL.
    console.log('Usage: phantomjs <script> URL OUTPUT');
    phantom.exit(1);
} else {
    url = system.args[1];
    output = system.args[2];

    page.open(url, function (status) {
        if (status != "success") {
            console.log('FAIL to load the url');
            phantom.exit();
            // Stop here so the evaluate/render steps below are not scheduled
            // for a page that failed to load.
            return;
        }

        page.evaluate(function (url) {
            // This function runs inside the target page, not in the PhantomJS
            // context, so the URL is passed in as an argument.
            window.scrollTo(0, 10000); // scroll to the bottom
            if (url.indexOf("shuqi") != -1) {
                window.setTimeout(function () {
                    var plist = document.querySelectorAll(".header");
                    // Click the element at index 3, if there is one.
                    if (plist.length > 3) {
                        plist[3].click();
                    }
                }, 5000);
            }
        }, url);

        // Render shortly after the in-page click (if any) has fired.
        window.setTimeout(function () {
            page.render(output);
            phantom.exit();
        }, 5000 + 500);
    });
}
\ No newline at end of file
"use strict";
// Load the sibling "universe" module, start it, then report its `answer`.
var universe = require('./universe'),
    message;

universe.start();
// Read `answer` only after start() has run, as the original does.
message = 'The answer is ' + universe.answer;
console.log(message);
phantom.exit();
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论