提交 66d65ac8 作者: 刘伟刚

采集代码更新11

上级 6ebd5e6b
......@@ -7,26 +7,32 @@
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="platdata2wsb" />
<module name="sina_search" />
<module name="so_crawler" />
<module name="twitter_crawler" />
<module name="haiguan" />
<module name="google_crawler" />
<module name="platdata2wsb" />
<module name="comm_crawler" />
<module name="yahoo" />
<module name="so_crawler" />
<module name="weibo_crawler" />
<module name="baidu_search" />
<module name="baidu_crawler" />
<module name="sougou_crawler" />
<module name="twitter_crawler" />
<module name="haiguan" />
<module name="google_crawler" />
<module name="baidu_mcrawler" />
</profile>
</annotationProcessing>
<bytecodeTargetLevel>
<module name="baidu_crawler (1)" target="1.8" />
<module name="clb_crawler" target="1.8" />
<module name="demo" target="1.8" />
</bytecodeTargetLevel>
</component>
<component name="JavacSettings">
<option name="ADDITIONAL_OPTIONS_OVERRIDE">
<module name="baidu_crawler" options="-parameters" />
<module name="baidu_crawler (1)" options="-parameters" />
<module name="baidu_mcrawler" options="-parameters" />
<module name="baidu_search" options="-parameters" />
<module name="comm_crawler" options="-parameters" />
<module name="google_crawler" options="-parameters" />
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/baidu_crawler/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/baidu_mcrawler/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/baidu_search/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/comm_crawler/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/google_crawler/src/main/java" charset="UTF-8" />
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="FrameworkDetectionExcludesConfiguration">
<file type="web" url="file://$PROJECT_DIR$/baidu_crawler" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
<option value="$PROJECT_DIR$/baidu_mcrawler/pom.xml" />
</list>
</option>
<option name="ignoredFiles">
......
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.5.4</version>
<relativePath/>
</parent>
<groupId>com.zzsn</groupId>
<artifactId>baidu_crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>war</packaging>
<name>baidu_crawler Tapestry 5 Application</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spring-boot-version>2.5.4</spring-boot-version>
<tapestry-version>5.8.2</tapestry-version>
<jackson-version>2.13.1</jackson-version>
<log4j-version>2.17.2</log4j-version>
<json-version>1.1.4</json-version>
<junit-version>5.8.2</junit-version>
<yasson-version>2.0.4</yasson-version>
<servlet-version>3.1.0</servlet-version>
<maven-compiler-version>3.8.1</maven-compiler-version>
<maven-surefire-version>3.0.0-M5</maven-surefire-version>
</properties>
<repositories>
<repository>
<id>apache-staging</id>
<url>https://repository.apache.org/content/repositories/staging/</url>
</repository>
</repositories>
<dependencies>
<!-- Spring -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-log4j2</artifactId>
<scope>compile</scope>
</dependency>
<!-- Apache Tapestry -->
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-core</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
<!-- CoffeeScript & Less support, plus resource minification -->
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-webresources</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
<!-- Uncomment this to add support for spring, hibernate, bean validation and file uploads -->
<!--
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-spring</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-hibernate</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-beanvalidator</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-upload</artifactId>
<version>${tapestry-version}</version>
<scope>compile</scope>
</dependency>
-->
<!-- Unit Testing -->
<dependency>
<groupId>org.apache.tapestry</groupId>
<artifactId>tapestry-test</artifactId>
<version>${tapestry-version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-version}</version>
<scope>test</scope>
</dependency>
<!-- Miscellaneous -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j-version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j-version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>${log4j-version}</version>
</dependency>
<dependency>
<groupId>org.eclipse</groupId>
<artifactId>yasson</artifactId>
<version>${yasson-version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson-version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson-version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-yaml</artifactId>
<version>${jackson-version}</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>${servlet-version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.0.0</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.5.0</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler-version}</version>
<configuration>
<release>11</release>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>${maven-surefire-version}</version>
<configuration>
<systemPropertyVariables>
<propertyName>firefox</propertyName>
</systemPropertyVariables>
</configuration>
<dependencies>
<dependency>
<groupId>org.apache.maven.surefire</groupId>
<artifactId>surefire-junit-platform</artifactId>
<version>${maven-surefire-version}</version>
</dependency>
<dependency>
<groupId>org.apache.maven.surefire</groupId>
<artifactId>surefire-testng</artifactId>
<version>${maven-surefire-version}</version>
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
package com.zzsn;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplication extends SpringBootServletInitializer {
@Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerMateSearchApplication.class);
}
public static void main(String[] args) {
SpringApplication.run(CrawlerMateSearchApplication.class, args);
}
}
package com.zzsn.cache;
import com.alibaba.druid.util.StringUtils;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Set;
public class JedisFactory {
private static final Logger Log = LoggerFactory.getLogger(JedisFactory.class);
private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
private static JedisPool jedisPool = null;
public static void init(){
String host = Constants.REDIS_LOCALHOST;
String port = Constants.REDIS_PORT;
String pass = Constants.REDIS_PASS;
String timeout = Constants.REDIS_TIMEOUT;
String maxIdle = Constants.REDIS_MAXIDLE;
String maxTotal = Constants.REDIS_MAXIDLE;
String maxWaitMillis = Constants.REDIS_MAXWAITMILLIS;
String testOnBorrow = Constants.REDIS_TESTONBORROW;
JedisPoolConfig config = new JedisPoolConfig();
//控制一个pool可分配多少个jedis实例,通过pool.getResource()来获取;
//如果赋值为-1,则表示不限制;如果pool已经分配了maxActive个jedis实例,则此时pool的状态为exhausted(耗尽)。
config.setMaxTotal(Integer.parseInt(maxTotal));
//控制一个pool最多有多少个状态为idle(空闲的)的jedis实例。
config.setMaxIdle(Integer.parseInt(maxIdle));
//表示当borrow(引入)一个jedis实例时,最大的等待时间,如果超过等待时间,则直接抛出JedisConnectionException;
config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
//在borrow一个jedis实例时,是否提前进行validate操作;如果为true,则得到的jedis实例均是可用的;
config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
}
public static String getKeyStr(String key) {
Jedis jedis=getJedis();
try {
if (StringUtils.isEmpty(key)) {
return null;
}
return jedis.get( key);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
}
return null;
}
private static Jedis getJedis() {
return jedisPool.getResource();
}
/**
* 设置缓存 永不过期,(一个月后会自动过期)
* @param key
* @return
*/
public static boolean setKeyStr(String key, String value) {
Jedis jedis=getJedis();
boolean result = false;
try {
if (StringUtils.isEmpty(key)) {
return false;
}
jedis.set(key, value);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
public static boolean setKeyStrtime(String key, String value,Integer time) {
Jedis jedis=getJedis();
boolean result = false;
try {
if (StringUtils.isEmpty(key)) {
return false;
}
jedis.set(key, value);
if (time > 0) {
/**
* 如果设置了 expireTime, 那么这个 finalKey会在expireTime秒后过期,那么该键会被自动删除
* 这一功能配合出色的性能让Redis可以作为缓存系统来使用,成为了缓存系统Memcached的有力竞争者
*/
jedis.expire(key, time);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
}
return result;
}
public static Set<String> getKeySet(String key) {
Jedis jedis=getJedis();
try {
Set<String> obj = jedis.smembers(key);
if("null".equals(obj)){
return null;
}
return obj;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
}
return null;
}
/**
* 设置缓存 永不过期,(一个月后会自动过期)
* @param key
* @return
*/
public static boolean setKeySet(String key, Set<String> value) {
Jedis jedis=getJedis();
boolean result = false;
try {
if(value==null){
return false;
}
jedis.sadd(key, value.toArray(new String[value.size()]));
} catch (Exception e){
e.printStackTrace();
} finally {
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
}
return result;
}
public static void del(String key) {
Jedis jedis=getJedis();
try {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
return;
}
jedis.del(key);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
if (jedis != null) {
// 如果使用 JedisPool , close 操作不是关闭连接,代表归还连接池
jedis.close();
}
}
}
}
package com.zzsn.cache;
import com.alibaba.druid.util.StringUtils;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.List;
import java.util.Set;
public class JedisUtil {
private static final String PREFIX = "baidu_";
private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
private static JedisPool jedisPool = null;
private JedisUtil() {
init();
}
//写成静态代码块形式,只加载一次,节省资源
// static {
//
// }
/**
* 从jedis连接池中获取获取jedis对象
*
* @return
*/
private static void init(){
// Properties properties = PropertyUtil.loadProperties("redis.properties");
String host = Constants.REDIS_LOCALHOST;
String port = Constants.REDIS_PORT;
String pass = Constants.REDIS_PASS;
String timeout = Constants.REDIS_TIMEOUT;
String maxIdle = Constants.REDIS_MAXIDLE;
String maxTotal = Constants.REDIS_MAXTOTAL;
String maxWaitMillis = Constants.REDIS_MAXWAITMILLIS;
String testOnBorrow = Constants.REDIS_TESTONBORROW;
JedisPoolConfig config = new JedisPoolConfig();
//控制一个pool可分配多少个jedis实例,通过pool.getResource()来获取;
//如果赋值为-1,则表示不限制;如果pool已经分配了maxActive个jedis实例,则此时pool的状态为exhausted(耗尽)。
config.setMaxTotal(Integer.parseInt(maxTotal));
//控制一个pool最多有多少个状态为idle(空闲的)的jedis实例。
config.setMaxIdle(Integer.parseInt(maxIdle));
//表示当borrow(引入)一个jedis实例时,最大的等待时间,如果超过等待时间,则直接抛出JedisConnectionException;
config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
//在borrow一个jedis实例时,是否提前进行validate操作;如果为true,则得到的jedis实例均是可用的;
config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
if(pass.equals("xxxxxx")){
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
}else{
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout),pass);
}
}
private static Jedis getJedis() {
// init();
return jedisPool.getResource();
}
private static final JedisUtil jedisUtil = new JedisUtil();
/**
* 获取JedisUtil实例
*
* @return
*/
public static JedisUtil getInstance() {
return jedisUtil;
}
public static void returnResource(final Jedis jedis) {
if (jedis != null && jedisPool != null) {
jedis.close();
}
}
public static Jedis getDefaultJedis() {
// return getJedis(HOST_IP, HOST_PORT);//简装版
return getJedis();
}
/**
* 根据 pattern 获取 redis 中的键
*/
public static Set<String> getKeysByPattern(String pattern) {
return getDefaultJedis().keys(pattern);
}
public static boolean exists(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
throw new Exception("key is null");
}
return getDefaultJedis().exists(PREFIX + key);
}
public static void del(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
getDefaultJedis().del(PREFIX + key);
}
public static void setString(String key, String value, int expireTime) throws Exception {
Jedis jedis=null;
try {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
String finalKey = PREFIX + key;
jedis = getDefaultJedis();
jedis.set(finalKey, value);
if (expireTime > 0) {
/**
* 如果设置了 expireTime, 那么这个 finalKey会在expireTime秒后过期,那么该键会被自动删除
* 这一功能配合出色的性能让Redis可以作为缓存系统来使用,成为了缓存系统Memcached的有力竞争者
*/
jedis.expire(finalKey, expireTime);
}
}catch (Exception e){
}finally {
returnResource(jedis);
}
}
public static String getString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().get(PREFIX + key);
}
public static String getNoPrefixString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().get(key);
}
public static long setnx(String key, String value) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().setnx(PREFIX + key, value);
}
public static long expire(String key, int seconds) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().expire(PREFIX + key, seconds);
}
public static void pushList(String key, String value, String flag) throws Exception {
if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
logger.error("key or flag is null");
throw new Exception("key or flag is null");
}
/**
* key代表的是链表的名字 List是一个双端链表,lpush是往链表的头部插入一条数据,rpush是往尾部插入一条数据
*/
if (flag.equalsIgnoreCase("L")) {
getDefaultJedis().lpush(PREFIX + key, value);
} else if (flag.equalsIgnoreCase("R")) {
getDefaultJedis().rpush(PREFIX + key, value);
} else {
logger.error("unknown flag");
throw new Exception("unknown flag");
}
}
public static String popList(String key, String flag) throws Exception {
if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
logger.error("key or flag is null");
throw new Exception("key or flag is null");
}
if (flag.equalsIgnoreCase("L")) {
return getDefaultJedis().lpop(PREFIX + key);
} else if (flag.equalsIgnoreCase("R")) {
return getDefaultJedis().rpop(PREFIX + key);
} else {
logger.error("unknown flag");
throw new Exception("unknown flag");
}
}
/**
* 获取 List 中指定区间上的元素
*/
public static List<String> getAppointedList(String key, long start, long end) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().lrange(PREFIX + key, start, end);
}
public static List<String> getList(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().lrange(PREFIX + key, 0, -1);
}
public static void sadd(String key,String value)throws Exception{
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
Jedis defaultJedis=null;
try {
defaultJedis = getDefaultJedis();
Long sadd = defaultJedis.sadd(key, value);
}catch (Exception e){
}finally {
if(defaultJedis!=null){
returnResource(defaultJedis);
}
}
}
public static boolean sismember(String key,String value)throws Exception{
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
Jedis defaultJedis=null;
Boolean aBoolean=false;
try {
defaultJedis = getDefaultJedis();
aBoolean = defaultJedis.sismember(key, value);
}catch (Exception e){
}finally {
returnResource(defaultJedis);
}
return aBoolean;
}
}
package com.zzsn.cache;
import io.protostuff.*;
import io.protostuff.runtime.RuntimeSchema;
import org.springframework.objenesis.Objenesis;
import org.springframework.objenesis.ObjenesisStd;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @Author Sugar
* @Version 2018/3/16 13:32
*/
public class ProtostuffUtil {
public static final Objenesis objenesis = new ObjenesisStd(true);
private static final String ERR_TRUNCATED_MESSAGE =
"While parsing a protocol message, the input ended unexpectedly " +
"in the middle of a field. This could mean either than the " +
"input has been truncated or that an embedded message " +
"misreported its own length.";
/**
* 序列化
*
* @param obj
* @return
*/
public static <T> byte[] serializer(T obj) {
LinkedBuffer buffer = LinkedBuffer.allocate(LinkedBuffer.DEFAULT_BUFFER_SIZE);
try {
Schema<T> schema = RuntimeSchema.getSchema((Class<T>) obj.getClass());
return ProtostuffIOUtil.toByteArray(obj, schema, buffer);
} catch (Exception e) {
throw new IllegalStateException("序列化对象失败:" + obj, e);
} finally {
buffer.clear();
}
}
/**
* 反序列化
*
* @param data
* @param clazz
* @return
*/
public static <T> T deserializer(byte[] data, Class<T> clazz) {
T obj = null;
try {
Schema<T> schema = RuntimeSchema.getSchema(clazz);
// obj = schema.newMessage();
obj = objenesis.newInstance(clazz);
ProtostuffIOUtil.mergeFrom(data, obj, schema);
} catch (Exception e) {
throw new IllegalStateException("反序列化对象失败:class=" + clazz + ", data=" + new String(data), e);
}
return obj;
}
public static <T> byte[] serializeList(List<T> list) {
@SuppressWarnings("unchecked")
Schema<T> schema = (Schema<T>) RuntimeSchema.getSchema(list.get(0).getClass());
LinkedBuffer buffer = LinkedBuffer.allocate(1024 * 1024);
ByteArrayOutputStream bos = null;
try {
bos = new ByteArrayOutputStream();
ProtostuffIOUtil.writeListTo(bos, list, schema, buffer);
return bos.toByteArray();
} catch (Exception e) {
throw new IllegalStateException("序列化对象列表失败:" + list, e);
} finally {
buffer.clear();
try {
if (bos != null) {
bos.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 反序列化对象列表
*
* @param data
* @param clazz
* @param <T>
* @return
*/
public static <T> List<T> deserializeList(byte[] data, Class<T> clazz) {
Schema<T> schema = RuntimeSchema.getSchema(clazz);
List<T> result = null;
try {
result = parseListFrom(new ByteArrayInputStream(data), schema, clazz);
} catch (IOException e) {
throw new IllegalStateException("反序列化对象列表失败:class=" + clazz + ", data=" + new String(data), e);
}
return result;
}
private static <T> List<T> parseListFrom(final InputStream in, final Schema<T> schema, Class<T> clazz)
throws IOException {
int size = in.read();
if (size == -1) {
return Collections.emptyList();
}
if (size > 0x7f) {
size = readRawVarint32(in, size);
}
final ArrayList<T> list = new ArrayList<T>(size);
final CodedInput input = new CodedInput(in, true);
for (int i = 0; i < size; i++) {
// final T message = schema.newMessage();
final T message = objenesis.newInstance(clazz);//使用objensis代替newInstance()
list.add(message);
schema.mergeFrom(input, message);
input.checkLastTagWas(0);
}
assert in.read() == -1;
return list;
}
/**
* Reads a varint from the input one byte at a time, so that it does not read any bytes after the end of the varint.
* If you simply wrapped the stream in a CodedInput and used readRawVarint32(InputStream) then you would
* probably end up reading past the end of the varint since CodedInput buffers its input.
*/
private static int readRawVarint32(final InputStream input, final int firstByte) throws IOException {
int result = firstByte & 0x7f;
int offset = 7;
for (; offset < 32; offset += 7) {
final int b = input.read();
if (b == -1) {
throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
}
result |= (b & 0x7f) << offset;
if ((b & 0x80) == 0) {
return result;
}
}
// Keep reading up to 64 bits.
for (; offset < 64; offset += 7) {
final int b = input.read();
if (b == -1) {
throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
}
if ((b & 0x80) == 0) {
return result;
}
}
throw new ProtobufException(
"CodedInput encountered a malformed varint.");
}
}
package com.zzsn.conf;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.web.servlet.MultipartAutoConfiguration;
import org.springframework.boot.web.servlet.MultipartConfigFactory;
import org.springframework.context.annotation.Bean;
import javax.servlet.MultipartConfigElement;
import java.io.File;
@SpringBootApplication(exclude = {MultipartAutoConfiguration.class})
public class SpringBootFileuploadApplication {
@Bean
MultipartConfigElement multipartConfigElement() {
MultipartConfigFactory factory = new MultipartConfigFactory();
String location = System.getProperty("user.dir") + "/data/tmp";
File tmpFile = new File(location);
if (!tmpFile.exists()) {
tmpFile.mkdirs();
}
factory.setLocation(location);
return factory.createMultipartConfig();
}
}
package com.zzsn.conf;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
@Configuration
@EnableAsync
public class ThreadExecutorConfig {
@Bean(value = "asyncTaskExecutor")
public Executor executor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量
executor.setQueueCapacity(5000);//缓存队列
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//允许的空闲时间
executor.initialize();
return executor;
}
@Bean(value = "asyncTaskExecutorSelenium")
public Executor executorSelenium() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(2);//线程池维护线程的最少数量
executor.setMaxPoolSize(4);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-");
/**
* 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//允许的空闲时间
executor.initialize();
return executor;
}
@Bean(value = "webExecutor")
public Executor webExecutor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量
executor.setQueueCapacity(5000);//缓存队列
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//允许的空闲时间
executor.initialize();
return executor;
}
@Bean(value = "detailExecutor")
public Executor detailExecutor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量
executor.setQueueCapacity(5000);//缓存队列
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//允许的空闲时间
executor.initialize();
return executor;
}
// @Bean(value = "asyncexecutorService")
// public Executor executorService() {
// ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
// executor.setCorePoolSize(1);//线程池维护线程的最少数量
// executor.setMaxPoolSize(1);//线程池维护线程的最大数量
// executor.setQueueCapacity(100000);//缓存队列
// executor.setThreadNamePrefix("selenium-");
// /**
// * 对拒绝task的处理策略
// rejection-policy:当pool已经达到max size的时候,如何处理新任务
// CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
// */
// executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
// executor.setKeepAliveSeconds(60*60);//允许的空闲时间
// executor.initialize();
// return executor;
// }
// @Bean(value = "asyncexecutorServiceWebBaidu")
// public Executor executorServiceWebBaidu() {
// ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
// executor.setCorePoolSize(5);//线程池维护线程的最少数量
// executor.setMaxPoolSize(5);//线程池维护线程的最大数量
// executor.setQueueCapacity(100000);//缓存队列
// executor.setThreadNamePrefix("selenium-");
// /**
// * 对拒绝task的处理策略
// rejection-policy:当pool已经达到max size的时候,如何处理新任务
// CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
// */
// executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
// executor.setKeepAliveSeconds(60*60);//允许的空闲时间
// executor.initialize();
// return executor;
// }
// @Bean(value = "asyncexecutorServiceDetailUrl")
// public Executor asyncexecutorServiceDetailUrl() {
// ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
// executor.setCorePoolSize(5);//线程池维护线程的最少数量
// executor.setMaxPoolSize(5);//线程池维护线程的最大数量
// executor.setQueueCapacity(100000);//缓存队列
// executor.setThreadNamePrefix("selenium-");
// /**
// * 对拒绝task的处理策略
// rejection-policy:当pool已经达到max size的时候,如何处理新任务
// CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
// */
// executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
// executor.setKeepAliveSeconds(60*60);//允许的空闲时间
// executor.initialize();
// return executor;
// }
}
package com.zzsn.docinfo;
import java.io.Serializable;
import java.util.Map;
/**
* 数据接口文档
* 创建人:李东亮
* 创建时间:2016-4-6 下午3:44:17
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class DocInfo implements Serializable{
private String contentType;
private Long orgId;
private Long sid;
//News:新闻,BBS:论坛,Blog:博客,MicroBlog:微博,WeChat:微信,Video:视频,Other:其他
private String sourceType;
private String lastModified;
private String charset;
private String sourceaddress;
private String lang;
private String title;
private String author;
private String publishDate;
private String origin;
private String keywords;
private String summary;
private String contentWithTag;
private String contentNoTag;
private String contentImgCvtTag;
private String fileDownLoadPath;
private Map<String,String> otherParams;
public Long getOrgId() {
return orgId;
}
public void setOrgId(Long orgId) {
this.orgId = orgId;
}
public String getContentType() {
return contentType;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
public Long getSid() {
return sid;
}
public void setSid(Long sid) {
this.sid = sid;
}
public String getSourceType() {
return sourceType;
}
public void setSourceType(String sourceType) {
this.sourceType = sourceType;
}
public String getLastModified() {
return lastModified;
}
public void setLastModified(String lastModified) {
this.lastModified = lastModified;
}
public String getCharset() {
return charset;
}
public void setCharset(String charset) {
this.charset = charset;
}
public String getSourceaddress() {
return sourceaddress;
}
public void setSourceaddress(String sourceaddress) {
this.sourceaddress = sourceaddress;
}
public String getLang() {
return lang;
}
public void setLang(String lang) {
this.lang = lang;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getPublishDate() {
return publishDate;
}
public void setPublishDate(String publishDate) {
this.publishDate = publishDate;
}
public String getOrigin() {
return origin;
}
public void setOrigin(String origin) {
this.origin = origin;
}
public String getKeywords() {
return keywords;
}
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getContentWithTag() {
return contentWithTag;
}
public void setContentWithTag(String contentWithTag) {
this.contentWithTag = contentWithTag;
}
public String getContentNoTag() {
return contentNoTag;
}
public void setContentNoTag(String contentNoTag) {
this.contentNoTag = contentNoTag;
}
public Map<String, String> getOtherParams() {
return otherParams;
}
public void setOtherParams(Map<String, String> otherParams) {
this.otherParams = otherParams;
}
public String getFileDownLoadPath() {
return fileDownLoadPath;
}
public void setFileDownLoadPath(String fileDownLoadPath) {
this.fileDownLoadPath = fileDownLoadPath;
}
public String getContentImgCvtTag() {
return contentImgCvtTag;
}
public void setContentImgCvtTag(String contentImgCvtTag) {
this.contentImgCvtTag = contentImgCvtTag;
}
}
package com.zzsn.entity;
import lombok.Data;
import java.util.Date;
@Data
public class BadSiteMsg {
/**主键*/
private String id;
/**信息源编码*/
private String infoSourceCode;
/**爬虫类别(1:动态 2:静态 3:500强 4:智库 5:百度)**/
private String crawlerType;
/**分区id (多个用英文逗号隔开)*/
private String partition;
/**消费时间*/
private Date consumerDate;
}
package com.zzsn.entity;
import java.io.Serializable;
public class SiteTemplate implements Serializable {
private static final long serialVersionUID = 1L;
String sourceId;
String siteName;
String siteHost;
String siteLang;
String matchUrl;
String matchTitle;
String matchSummary;
String matchContent;
String matchPublishDate;
String matchOrigin;
String matchAuthor;
String excloume1;
String excloume2;
public SiteTemplate() {
super();
}
public SiteTemplate(String siteName, String siteHost, String siteLang, String matchUrl, String matchTitle,
String matchSummary, String matchContent, String matchPublishDate, String matchOrigin, String matchAuthor,
String excloume1, String excloume2) {
super();
this.siteName = siteName;
this.siteHost = siteHost;
this.siteLang = siteLang;
this.matchUrl = matchUrl;
this.matchTitle = matchTitle;
this.matchSummary = matchSummary;
this.matchContent = matchContent;
this.matchPublishDate = matchPublishDate;
this.matchOrigin = matchOrigin;
this.matchAuthor = matchAuthor;
this.excloume1 = excloume1;
this.excloume2 = excloume2;
}
public String getSourceId() {
return sourceId;
}
public void setSourceId(String sourceId) {
this.sourceId = sourceId;
}
public String getSiteName() {
return siteName;
}
public void setSiteName(String siteName) {
this.siteName = siteName;
}
public String getSiteHost() {
return siteHost;
}
public void setSiteHost(String siteHost) {
this.siteHost = siteHost;
}
public String getSiteLang() {
return siteLang;
}
public void setSiteLang(String siteLang) {
this.siteLang = siteLang;
}
public String getMatchUrl() {
return matchUrl;
}
public void setMatchUrl(String matchUrl) {
this.matchUrl = matchUrl;
}
public String getMatchTitle() {
return matchTitle;
}
public void setMatchTitle(String matchTitle) {
this.matchTitle = matchTitle;
}
public String getMatchSummary() {
return matchSummary;
}
public void setMatchSummary(String matchSummary) {
this.matchSummary = matchSummary;
}
public String getMatchContent() {
return matchContent;
}
public void setMatchContent(String matchContent) {
this.matchContent = matchContent;
}
public String getMatchPublishDate() {
return matchPublishDate;
}
public void setMatchPublishDate(String matchPublishDate) {
this.matchPublishDate = matchPublishDate;
}
public String getMatchOrigin() {
return matchOrigin;
}
public void setMatchOrigin(String matchOrigin) {
this.matchOrigin = matchOrigin;
}
public String getMatchAuthor() {
return matchAuthor;
}
public void setMatchAuthor(String matchAuthor) {
this.matchAuthor = matchAuthor;
}
@Override
public String toString() {
return "SiteTemplate [sourceId=" + sourceId + ", siteName=" + siteName + ", siteHost=" + siteHost
+ ", siteLang=" + siteLang + ", matchUrl=" + matchUrl + ", matchTitle=" + matchTitle + ", matchSummary="
+ matchSummary + ", matchContent=" + matchContent + ", matchPublishDate=" + matchPublishDate
+ ", matchOrigin=" + matchOrigin + ", matchAuthor=" + matchAuthor + ", excloume1=" + excloume1
+ ", excloume2=" + excloume2 + "]";
}
}
package com.zzsn.job;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
* 阻塞线程池
* 线程池的线程数到达最大线程数阻塞等待
* 可用于多线程获取MQ消息任务
* 因为会阻塞,就不用考虑拒绝策略这一块的重写
*/
public class BlockThreadPoolExecute extends ThreadPoolExecutor {
private ReentrantLock lock = new ReentrantLock();
private Condition condition = this.lock.newCondition();
public BlockThreadPoolExecute(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit, BlockingQueue<Runnable> workQueue) {
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
}
@Override
public void execute(Runnable command) {
//进行同步锁定
this.lock.lock();
super.execute(command);
try {
//如果线程池的数量已经达到最大线程池的数量,则进行挂起操作
if (getPoolSize() == getMaximumPoolSize()) {
this.condition.await();
}
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
this.lock.unlock();
}
}
@Override
protected void afterExecute(Runnable r, Throwable t) {
try{
lock.lock();
this.condition.signal();
}finally {
this.lock.unlock();
}
}
}
\ No newline at end of file
package com.zzsn.job;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSON;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Component
@EnableScheduling
@Slf4j
public class KafkaConsumerJob {
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//kafka数据的读取方式
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
/**fixedDelay:上一次执行完毕时间点之后一分钟再执行*/
// @Scheduled(cron = "0 0/2 * * * ?")
// @Scheduled(fixedDelay=60000)
public void consumer (){
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
try{
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
try {
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
}catch (Exception e){
continue;
}
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
// @Scheduled(cron = "0 0/2 * * * ?")
public void consumerPartition (){
try {
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
}
consumer.assign(topicPartitions);
try {
while (true) {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
for (ConsumerRecord record : records) {
KeywordMsg keywordMsg=null;
try {
keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
}catch (Exception e){
log.info("关键词组解析异常:"+record.value().toString());
}
consumer.commitSync();
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
}
}
} catch (Exception e) {
// consumer = createConsumer();
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}catch (Exception e){
log.info("kafka调用信息失败");
}
}
// @Scheduled(fixedRate = 1000*60*60*20)
// @Scheduled(fixedRate = 1000*60*10)
public void loadSiteMsgLoc() {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
System.out.println(filepath);
try {
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
System.out.println(allLines.size());
for (String keysite:allLines) {
try {
String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::"+keysite);
log.info("——————++++++++++++——————==="+"KEY_WORDS_TO_REDIS::"+keysite);
log.info("关键词请求开始++++"+DateTime.now());
String subvalue=value.replace(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")),"");
KeywordMsg keywordMsg = JSON.parseObject(subvalue, KeywordMsg.class);
if(keywordMsg.getSearchEngines().size()>1){
String engines=keywordMsg.getSearchEngines().get(1);
if(!engines.contains("3")) {
continue;
}
}
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
log.info("关键词请求结束++++"+ DateTime.now());
}catch (Exception e){
log.info("关键词解析异常++++"+ e.getMessage());
continue;
}
}
}catch (Exception e){
e.getMessage();
}
}
}
package com.zzsn.job;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.core.ProducerFactory;
import java.util.HashMap;
import java.util.Map;
/**
* kafka生产者配置
* @author Java小白
*
*/
@Configuration
@EnableKafka
public class KafkaProducerConfig {
@Value("${kafka.producer.servers}")
private String servers;
@Value("${kafka.producer.retries}")
private int retries;
@Value("${kafka.producer.batch.size}")
private int batchSize;
@Value("${kafka.producer.linger}")
private int linger;
@Value("${kafka.producer.buffer.memory}")
private int bufferMemory;
/**
* 配置生产者信息(消费提供者信息)
* @return
*/
public Map<String, Object> producerConfigs() {
Map<String, Object> props = new HashMap<>();
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, servers);
props.put(ProducerConfig.RETRIES_CONFIG, retries);
props.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize);
props.put(ProducerConfig.LINGER_MS_CONFIG, linger);
props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, bufferMemory);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
return props;
}
/**
* 消费工厂
* @return
*/
public ProducerFactory<String, String> producerFactory() {
return new DefaultKafkaProducerFactory<>(producerConfigs());
}
/**
* 消息发送工具类
* @return
*/
@Bean
public KafkaTemplate<String, String> kafkaTemplate() {
//需要指定消费工厂
return new KafkaTemplate<String, String>(producerFactory());
}
}
\ No newline at end of file
package com.zzsn.job;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
public class PropertyUtil {
//加载property文件到io流里面
public static Properties loadProperties(String propertyFile) {
Properties properties = new Properties();
try {
InputStream is = PropertyUtil.class.getClassLoader().getResourceAsStream(propertyFile);
if(is == null){
is = PropertyUtil.class.getClassLoader().getResourceAsStream("properties/" + propertyFile);
}
properties.load(is);
} catch (IOException e) {
e.printStackTrace();
}
return properties;
}
/**
* 根据key值取得对应的value值
*
* @param key
* @return
*/
public static String getValue(String propertyFile, String key) {
Properties properties = loadProperties(propertyFile);
return properties.getProperty(key);
}
}
package com.zzsn.paser;
import com.zzsn.utility.index.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
public class SeleniumTime {
public ChromeOptions chromeOptions =new ChromeOptions() ;
public ChromeDriver driver;
public SeleniumTime(){
// System.setProperty("webdriver.chrome.driver", "E:\\cmd\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "D:\\cmdvip\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "E:\\chrome\\chromedriver.exe");
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
chromeOptions.addArguments("blink-settings=imagesEnabled=false");
// chromeOptions.addArguments("--headless");
driver = new ChromeDriver(chromeOptions);
}
/**
* 根据网址获取网页html信息
* @param url
* @return
*/
public String getScopehtml(String url){
//=====================================================================================================
// ChromeOptions chromeOptions =new ChromeOptions();
//// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// System.setProperty("webdriver.chrome.driver", "D:\\project\\cmd\\chromedriver.exe");
// //System.setProperty("webdriver.chrome.bin", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //chromeOptions.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe
// //C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
// //chromeOptions.addArguments("--headless");
// ChromeDriver driver = new ChromeDriver(chromeOptions);
//=====================================================================================================
try{
driver.get(url);
WebElement webElement = driver.findElement(By.xpath("/html"));
try{
Thread.sleep(3000l);
String html = webElement.getAttribute("outerHTML");
Thread.sleep(5000l);
driver.quit();
// System.out.println(html);
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
}catch(Exception e){
System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
+"可能原因为过快的执行没有找到指定的页面元素");
System.out.println("=============执行方法二==============");
Thread.sleep(3000l);
String html = driver.getPageSource();
Thread.sleep(5000l);
driver.quit();
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
}
// Thread.sleep(3000l);
// String source = driver.getPageSource();
// //if(source.length()!=0){
// driver.quit();
// return source;
//}
// String html = webElement.getAttribute("outerHTML");
// //System.out.println(html);
// driver.quit();
// return html;
//==========================================================================
// driver.get(url);
// // 休眠1s,为了让js执行完
// Thread.sleep(1000l);
// // 网页源码
// String source = driver.getPageSource();
// System.out.println("进入SeleniumTime中的getScopehtml方法获取相应的html");
// driver.quit();
// return source;
}catch(Exception e){
try {
Thread.sleep(5000l);
} catch (InterruptedException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
driver.quit();
e.printStackTrace();
}
try {
Thread.sleep(5000l);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
driver.quit();
return null;
}
public static void main(String[] args) {
//去除html中的相关标签
/**
* 网上大多是说明直接使用正则表达式不能很好的适用于html
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/
SeleniumTime s = new SeleniumTime();
String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
System.out.println("开始");
if(scopehtml.contains(a)){
System.out.println("包含a");
}
if(scopehtml.contains(a)){
System.out.println("包含b");
}
System.out.println("结束");
String[] split = scopehtml.split(a);
String sa = split[0];
System.out.println("首次截取的长度"+split.length);
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
System.out.println("再次截取的长度"+split2.length);
String sab = sa + substring ;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File file = new File("D:/123.txt");
try {
PrintStream ps = new PrintStream(new FileOutputStream(file));
ps.println(sab);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
package com.zzsn.paser;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.Site;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.DefaultMsg;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utility.util.Utility;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
public class SourceTemplateByTag {
// 从缓存中获取对应的模板
// public SiteTemplate getSiteTemp(String infourl){
// SiteTemplate sTemplate=new SiteTemplate();
// try {
// String domain=new URL(infourl).getHost();
// Site site=(Site) MemcachedUtils.get("domainUri_"+domain);
//
// if (null != site.getMatchTitle()) {
// sTemplate.setMatchTitle(site.getMatchTitle());
// }
// if (null != site.getMatchSummary()) {
// sTemplate.setMatchSummary(site.getMatchSummary());
// }
// if (null != site.getMatchContent()) {
// sTemplate.setMatchContent(site.getMatchContent());
// }
// if (null != site.getMatchTitle()) {
// sTemplate.setMatchAuthor(site.getMatchAuthor());
//
// }
// if (null != site.getMatchOrigin()) {
// sTemplate.setMatchOrigin(site.getMatchOrigin());
// }
// if (null != site.getMatchPublishDate()) {
// sTemplate.setMatchPublishDate(site.getMatchPublishDate());
// }
//
// }catch (Exception e){
// return null;
// }
// return sTemplate;
// }
static List<Site> sList=new ArrayList<Site>();
//保存有问题的站点
// public static void saveNoTempSite( CatchWebByMetaSearch cwbm ){
// try {
//
// String infourl = cwbm.getSourceaddress();
// String domainurl = new URL(infourl).getHost();
// Site site = new Site();
// site.setDomainUri(domainurl);
// site.setUri(infourl);
// site.setName(cwbm.getSourcesite());
//
// Object cacheObj=MemcachedUtils.get("tempSite");
// if (null==cacheObj ) {
// List<Site> s2List=new ArrayList<Site>();
// sList=s2List;
// }
// sList.add(site);
// MemcachedUtils.set("tempSite", sList,60*60*24);
// System.out.println("保存成功!!");
// }catch (Exception e){
// e.printStackTrace();
// System.out.println("保存失败");
// }
// }
public static DocInfo doPaserByTag(String htmlContent, DocInfo docInfo, SiteTemplate siteTemplate){
DefaultMsg dm = new DefaultMsg();
Document doc = Jsoup.parse(htmlContent);
System.out.println("===========doPaserByTag");
if(null!=siteTemplate.getMatchTitle()&&siteTemplate.getMatchTitle().length()>0) {
//标题
String title =paseElementByCSS(doc,siteTemplate.getMatchTitle());
if (StringUtils.isNotEmpty(title)) {
docInfo.setTitle(title.replace("...", ""));
}
}
if(null!=siteTemplate.getMatchContent()&&siteTemplate.getMatchContent().length()>0) {
Elements elementsByTag = doc.select(siteTemplate.getMatchContent());
String contentWithTag = Utility.RemoveUselessHTMLTagX(elementsByTag.html());
System.out.println("==========="+elementsByTag);
// String contentWithTag =paseElementByCSS(doc,siteTemplate.getMatchContent());
if (contentWithTag == null || contentWithTag.trim().length() == 0) {
return docInfo;
}
docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentWithTag));
docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
}
if(null!=siteTemplate.getMatchAuthor()&&siteTemplate.getMatchAuthor().length()>0) {
String author=paseElementByCSS(doc,siteTemplate.getMatchAuthor());
if(author.length()>0) {
docInfo.setAuthor(author);
}
}
if(null!=siteTemplate.getMatchPublishDate()&&siteTemplate.getMatchPublishDate().length()>0) {
String publishDate=paseElementByCSS(doc,siteTemplate.getMatchPublishDate());
if(publishDate.length()>0) {
docInfo.setPublishDate(DateUtil.getPublishDate(publishDate));
}
}
if(null!=siteTemplate.getMatchSummary()&&siteTemplate.getMatchSummary().length()>0) {
String summary=paseElementByCSS(doc,siteTemplate.getMatchSummary());
if(summary.length()>0) {
docInfo.setSummary(summary);
}
}
if(null!=siteTemplate.getMatchOrigin()&&siteTemplate.getMatchOrigin().length()>0) {
String origin=paseElementByCSS(doc,siteTemplate.getMatchOrigin());
if(origin.length()>0) {
docInfo.setOrigin(origin);
}
}
// this.buildProcessItem(docInfo);
return docInfo;
}
public static String paseElementByCSS(Document doc,String tag){
String msg="";
try {
Elements elements = doc.select(tag);
if (elements.size() > 0) {
msg = elements.get(0).text().trim();
}
}catch (Exception e){
e.printStackTrace();
}finally {
return msg;
}
// return msg;
}
}
package com.zzsn.search;
import com.zzsn.utility.index.Constants;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* 百度搜索
* 1.根据关键词请求进入页面,
* 2.将抓取到的内容信息保存到本地数据库表
*/
public class BaiduSearch {
public static void main(String[] args) throws IOException {
// String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH;
// String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
BaiduSearchThread baiduSearchThread = new BaiduSearchThread();
paser(allLines);
}
public static void paser(List<String> keywords){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<BaiduSearchThread> workers = new Vector<BaiduSearchThread>();
int index = 0;
try {
for (List<String> keywordList : splitList) {
// BaiduSearchThread worker = new BaiduSearchThread();
BaiduSearchThread baiduSearchThread = new BaiduSearchThread();
baiduSearchThread.setThreadId(index++);
baiduSearchThread.setKeywords(keywordList);
workers.add(baiduSearchThread);
threadPool.execute(baiduSearchThread);
Thread.sleep(1000);
}
}catch (Exception e){
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
// 对list进行分割
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
}
package com.zzsn.search;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSON;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Component
@Slf4j
public class CrawlerRun implements CommandLineRunner {
@Override
public void run(String... args) {
try {
consumerGroup();
}catch (Exception e){
consumerGroup();
}
// try {
// consumerPartition();
// }catch (Exception e){
// try {
// consumerPartition();
// }catch (Exception e2){
// consumerPartition();
// }
// }
// try {
// loadSiteFitler();
// }catch (Exception e){
// loadSiteFitler();
// }
// try {
// loadSiteMsgLoc();
// }catch (Exception e){
// loadSiteMsgLoc();
// }
}
public void consumerGroup (){
log.info("定时获取mq消息");
//1.创建消费者
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
log.info("进入定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
while(true){
try {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(300);
for(ConsumerRecord record : records){
try {
consumer.commitSync();
KeywordMsg keywordMsg = JSON.parseObject(record.value().toString(), KeywordMsg.class);
log.info("关键词请求开始++++"+ DateTime.now());
if (keywordMsg.getSearchEngines().size() > 1) {
List<String> engines=keywordMsg.getSearchEngines();
if (!engines.contains("3")) {
continue;
}
} else {
continue;
}
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
log.info("关键词请求结束++++"+DateTime.now());
}catch (Exception e){
log.info("关键词解析异常: "+record.value().toString());
}
}
}catch (Exception e ){
log.info("kafka获取信息异常");
}
}
}
public void consumerPartition (){
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
}
consumer.assign(topicPartitions);
try {
Thread.sleep(30000);
} catch (InterruptedException e) {
e.printStackTrace();
}
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
for(ConsumerRecord record : records){
try {
KeywordMsg keywordMsg = JSON.parseObject(record.value().toString(), KeywordMsg.class);
log.info("关键词解析keywordMsg正常");
consumer.commitSync();
if (keywordMsg.getSearchEngines().size() > 1) {
List<String> engines=keywordMsg.getSearchEngines();
if (!engines.contains("3")) {
continue;
}
} else {
continue;
}
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
log.info("关键词请求结束++++");
}catch (Exception e){
log.info("关键词解析异常: "+record.value().toString());
}
}
}
}
public void loadSiteMsgLoc() {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
System.out.println(filepath);
// String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
try {
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
System.out.println(allLines.size());
for (String keysite:allLines) {
try {
String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::"+keysite);
log.info("关键词请求开始++++"+DateTime.now());
String subvalue=value.replace(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")),"");
KeywordMsg keywordMsg = JSON.parseObject(subvalue, KeywordMsg.class);
if(keywordMsg.getSearchEngines().size()>1){
String engines=keywordMsg.getSearchEngines().get(1);
if(!engines.contains("3")) {
continue;
}
}
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
log.info("关键词请求结束++++"+DateTime.now());
}catch (Exception e){
e.printStackTrace();
continue;
}
}
}catch (Exception e){
e.getMessage();
}
}
public void loadloc(){
String key="{\n" +
" \"id\": \"2022090522\",\n" +
" \"wordsCode\": \"KW-20220602-0003\",\n" +
" \"wordsName\": \"2022世界机器人大会\",\n" +
" \"keyWord\": \"2022世界机器人大会\",\n" +
" \"exclusionWord\": null,\n" +
" \"status\": \"1\",\n" +
" \"subjectId\": null,\n" +
" \"subjectIds\": null,\n" +
" \"startTime\": null,\n" +
" \"endTime\": null \n" +
"}";
try {
KeywordMsg keywordMsg = JSON.parseObject(key, KeywordMsg.class);
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
}catch (Exception e){
e.printStackTrace();
}
}
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
//kafka数据的读取方式
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
}
package com.zzsn.search;
import java.io.*;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class FileUtil {
public static List<String> getFileLines(File file, String encoding)
throws IOException
{
List<String> lines = new ArrayList<String>();
if (encoding == null)
{
encoding = "utf-8";
}
FileInputStream stream = new FileInputStream(file);
InputStreamReader reader = new InputStreamReader(stream, encoding);
BufferedReader bufferedReader = new BufferedReader(reader);
String line = null;
while ((line = bufferedReader.readLine()) != null)
{
lines.add(line);
}
bufferedReader.close();
reader.close();
stream.close();
return lines;
}
//导出到csv文件
public static void array2CSV(ArrayList<ArrayList<String>> data, String path)
{
try {
BufferedWriter out =new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path),"UTF-8"));
for (int i = 0; i < data.size(); i++)
{
ArrayList<String> onerow=data.get(i);
for (int j = 0; j < onerow.size(); j++)
{
out.write(DelQuota(onerow.get(j)));
out.write("|");
}
out.newLine();
}
out.flush();
out.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public static String DelQuota(String str)
{
String result = str;
// String[] strQuota = { "~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "`", ";", "'", ",", ".", "/", ":", "/,", "<", ">", "?" };
String[] strQuota = { "|" };
for (int i = 0; i < strQuota.length; i++)
{
if (result.indexOf(strQuota[i]) > -1)
result = result.replace(strQuota[i], "");
}
return result;
}
public static void outMsg(String fileName, String content) {
File files = new File(fileName);
FileWriter writer = null;
try {
if (!files.exists()) {
files.createNewFile();
}
// 打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
writer = new FileWriter(files, true);
writer.write(content);
writer.write("\n");
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if(writer != null){
writer.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static String getFileBody(File file, String encoding)
{
StringBuffer buffer = new StringBuffer();
try {
List<String> lines = getFileLines(file, encoding);
for (String line : lines)
{
buffer.append(line);
buffer.append("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
return buffer.toString();
}
public static String getFileBody(String path, String encoding)
{
File file = new File(path);
if (!file.exists())
{
return null;
}
return getFileBody(file, encoding);
}
public static void mergeFile(String dir, int depth,
String outPath, String encoding)
{
File dirFile = new File(dir);
List<File> filesForMerge = collectFiles(dirFile, depth, -1);
StringBuffer buffer = new StringBuffer();
for (File file : filesForMerge)
{
String temp = getFileBody(file, encoding);
buffer.append(temp);
buffer.append("\n");
}
try {
PrintWriter pw = new PrintWriter(outPath);
pw.write(buffer.toString());
pw.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static List<File> collectFiles(File dirFile, int depth, int sampleNum)
{
List<File> filesForMerge = new ArrayList<File>();
if (1 == depth)
{
File[] subFiles = dirFile.listFiles();
for (File file : subFiles)
{
if (file.isDirectory())
{
continue;
}
filesForMerge.add(file);
}
filesForMerge = sampleFiles(filesForMerge, sampleNum);
}
else
{
File[] subFiles = dirFile.listFiles();
for (File file : subFiles)
{
if (!file.isDirectory())
{
continue;
}
List<File> files = collectFiles(file, depth-1, sampleNum);
if (files.size() > 0)
{
filesForMerge.addAll(files);
}
}
}
return filesForMerge;
}
private static List<File> sampleFiles(List<File> origFiles, int sampleNum)
{
List<File> sampleFiles = new ArrayList<File>();
int totalSize = origFiles.size();
if (sampleNum >= 0 && totalSize > sampleNum)
{
if (sampleNum < totalSize / 10)
{
sampleNum = totalSize / 10;
}
Random random = new Random();
int count = 0;
while (count < sampleNum)
{
int pos = random.nextInt(totalSize);
File nextFile = origFiles.get(pos);
if (sampleFiles.contains(nextFile))
{
continue;
}
sampleFiles.add(nextFile);
count ++;
}
return sampleFiles;
}
return origFiles;
}
/**
* eg: cmread-books\dushi\16
* */
public static String getDifferPath(File subFile, File ancestorFile)
{
File parentFile = subFile.getParentFile();
String dirpath = "";
while (!parentFile.getPath().equalsIgnoreCase(ancestorFile.getPath()))
{
dirpath = String.format("%s\\%s", parentFile.getName(), dirpath);
parentFile = parentFile.getParentFile();
}
return dirpath;
}
public static void makeDir(String path)
{
File dirFile = new File(path);
if (!dirFile.exists())
{
dirFile.mkdirs();
}
}
/**
* find file/dir toFindFileName from ancestors of searchDirName
* */
public static File findDecestorFile(File searchDir, String toFindFileName)
{
if (searchDir.getName().equalsIgnoreCase(toFindFileName))
{
return searchDir;
}
if (searchDir.isDirectory())
{
File[] subFiles = searchDir.listFiles();
for (File subFile : subFiles)
{
File temp = findDecestorFile(subFile, toFindFileName);
if (temp != null)
{
return temp;
}
}
}
return null;
}
public static File findFile(String fileName, File parentFile, int compType)
{
File[] files = parentFile.listFiles();
switch (compType) {
case 0: //equal
for (File file : files)
{
if (file.getName().equalsIgnoreCase(fileName))
{
return file;
}
}
break;
case 1: //start with
for (File file : files)
{
if (file.getName().toLowerCase().startsWith(fileName.toLowerCase()))
{
return file;
}
}
break;
case 2: //contains
for (File file : files)
{
if (file.getName().toLowerCase().indexOf(fileName.toLowerCase()) != -1)
{
return file;
}
}
break;
default:
break;
}
return null;
}
private static final char BOM = '\uFEFF';
public static ArrayList<String> readTxtFile(String fileName,String encoding) {
ArrayList<String> rtn = new ArrayList<String>();
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), encoding));
String line = null;
boolean isFirstLine = true;
while ((line = reader.readLine()) != null) {
if (isFirstLine && line.length() > 0 && line.charAt(0) == BOM) {
line = line.substring(1);
isFirstLine = false;
}
rtn.add(line);
}
} catch (Exception e) {
String errMsg = String.format("Exception while reading file: %s", fileName, e.getMessage());
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
String errMsg = String.format("Exception while closing file [%s]: %s", fileName, e.getMessage());
e.printStackTrace();
}
}
}
return rtn;
}
public static void copyFile(File source, File dest) throws IOException {
FileChannel inputChannel = null;
FileChannel outputChannel = null;
try {
inputChannel = new FileInputStream(source).getChannel();
outputChannel = new FileOutputStream(dest).getChannel();
outputChannel.transferFrom(inputChannel, 0, inputChannel.size());
} finally {
inputChannel.close();
outputChannel.close();
}
}
public static List<String> findFiles(String ph) {
List<String> filesList = new ArrayList<String>();
File file = new File(ph);
if (!file.exists()) {
return null;
}
File tempFile;
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++) {
tempFile = files[i];
if (tempFile.isFile()) {
filesList.add(tempFile.getName().toString());
}
}
return filesList;
}
}
package com.zzsn.search.db;
import java.sql.Connection;
import java.sql.SQLException;
/**
* 数据库连接层MYSQL
* @author Administrator
*
*/
public class DBConnection {
/**
* 连接数据库
* @return
*/
public static Connection getDBConnection()
{
// 1. 注册驱动
try {
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// 获取数据库的连接
try {
// Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://192.168.1.183:3306/clb_project?useUnicode=true&characterEncoding=utf8", "root", "root");
Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://114.115.159.144:3306/clb_project?useUnicode=true&characterEncoding=utf-8&useSSL=false", "root", "zzsn9988");
return conn;
} catch (SQLException e1) {
e1.printStackTrace();
}
return null;
}
}
package com.zzsn.search.db;
import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
/**
* MYSQL数据库底层封装
* @author Administrator
*
*/
public class DBManager {
private PreparedStatement pstmt;
private Connection conn;
private ResultSet rs;
/**
* 打开数据库
*/
public DBManager() {
conn = DBConnection.getDBConnection();
}
/**
* 执行修改添加操作
* @param coulmn
* @param type
* @param sql
* @return
* @throws SQLException
*/
public boolean updateOrAdd(String[] coulmn, int[] type, String sql) throws SQLException
{
if(!setPstmtParam(coulmn, type, sql))
return false;
boolean flag = pstmt.executeUpdate()>0?true:false;
closeDB();
return flag;
}
/**
* 获取查询结果集
* @param coulmn
* @param type
* @param sql
* @throws SQLException
*/
public DataTable getResultData(String[] coulmn, int[] type, String sql) throws SQLException
{
DataTable dt = new DataTable();
ArrayList<HashMap<String, String>>list = new ArrayList<HashMap<String, String>>();
if(!setPstmtParam(coulmn, type, sql))
return null;
rs = pstmt.executeQuery();
ResultSetMetaData rsmd = rs.getMetaData();//取数据库的列名
int numberOfColumns = rsmd.getColumnCount();
while(rs.next())
{
HashMap<String, String> rsTree = new HashMap<String, String>();
for(int r=1;r<numberOfColumns+1;r++)
{
rsTree.put(rsmd.getColumnName(r),rs.getObject(r).toString());
}
list.add(rsTree);
}
closeDB();
dt.setDataTable(list);
return dt;
}
/**
* 参数设置
* @param coulmn
* @param type
* @throws SQLException
* @throws NumberFormatException
*/
private boolean setPstmtParam(String[] coulmn, int[] type, String sql) throws NumberFormatException, SQLException
{
if(sql== null) return false;
pstmt = conn.prepareStatement(sql);
if(coulmn != null && type != null && coulmn.length !=0 && type.length !=0 )
{
for (int i = 0; i<type.length; i++) {
switch (type[i]) {
case Types.INTEGER:
pstmt.setInt(i+1, Integer.parseInt(coulmn[i]));
break;
case Types.BOOLEAN:
pstmt.setBoolean(i+1, Boolean.parseBoolean(coulmn[i]));
break;
case Types.CHAR:
pstmt.setString(i+1, coulmn[i]);
break;
case Types.DOUBLE:
pstmt.setDouble(i+1, Double.parseDouble(coulmn[i]));
break;
case Types.FLOAT:
pstmt.setFloat(i+1, Float.parseFloat(coulmn[i]));
break;
default:
break;
}
}
}
return true;
}
/**
* 关闭数据库
* @throws SQLException
*/
private void closeDB() throws SQLException
{
if(rs != null)
{
rs.close();
}
if(pstmt != null)
{
pstmt.close();
}
if(conn != null)
{
conn.close();
}
}
}
\ No newline at end of file
package com.zzsn.search.db;
import java.util.*;
/**
* 数据集封装
* @author Administrator
*/
public class DataTable {
public String[] column;//列字段
public String[][] row; //行值
public int rowCount = 0;//行数
public int colCoun = 0;//列数
public DataTable() {
super();
}
public DataTable(String[] column, String[][] row, int rowCount, int colCoun) {
super();
this.column = column;
this.row = row;
this.rowCount = rowCount;
this.colCoun = colCoun;
}
public void setDataTable(ArrayList<HashMap<String, String>> list) {
rowCount = list.size();
colCoun = list.get(0).size();
column = new String[colCoun];
row = new String[rowCount][colCoun];
for (int i = 0; i < rowCount; i++) {
Set<Map.Entry<String, String>> set = list.get(i).entrySet();
int j = 0;
for (Iterator<Map.Entry<String, String>> it = set.iterator(); it
.hasNext();) {
Map.Entry<String, String> entry = (Map.Entry<String, String>) it
.next();
row[i][j] = entry.getValue();
if (i == rowCount - 1) {
column[j] = entry.getKey();
}
j++;
}
}
}
public String[] getColumn() {
return column;
}
public void setColumn(String[] column) {
this.column = column;
}
public String[][] getRow() {
return row;
}
public void setRow(String[][] row) {
this.row = row;
}
public int getRowCount() {
return rowCount;
}
public void setRowCount(int rowCount) {
this.rowCount = rowCount;
}
public int getColCoun() {
return colCoun;
}
public void setColCoun(int colCoun) {
this.colCoun = colCoun;
}
}
\ No newline at end of file
package com.zzsn.search.db;
import lombok.extern.slf4j.Slf4j;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
/**
* @author xub
* @Description: 雪花算法
* @date 2019/8/14 下午8:22
*/
@Slf4j
public class SnowIdUtils {
/**
* 私有的 静态内部类
*/
private static class SnowFlake {
/**
* 内部类对象(单例模式)
*/
private static final SnowFlake SNOW_FLAKE = new SnowFlake();
/**
* 起始的时间戳
*/
private final long START_TIMESTAMP = 1557489395327L;
/**
* 序列号占用位数
*/
private final long SEQUENCE_BIT = 12;
/**
* 机器标识占用位数
*/
private final long MACHINE_BIT = 10;
/**
* 时间戳位移位数
*/
private final long TIMESTAMP_LEFT = SEQUENCE_BIT + MACHINE_BIT;
/**
* 最大序列号 (4095)
*/
private final long MAX_SEQUENCE = ~(-1L << SEQUENCE_BIT);
/**
* 最大机器编号 (1023)
*/
private final long MAX_MACHINE_ID = ~(-1L << MACHINE_BIT);
/**
* 生成id机器标识部分
*/
private long machineIdPart;
/**
* 序列号
*/
private long sequence = 0L;
/**
* 上一次时间戳
*/
private long lastStamp = -1L;
/**
* 构造函数初始化机器编码
*/
private SnowFlake() {
//模拟这里获得本机机器编码
long localIp = 4321;
//localIp & MAX_MACHINE_ID最大不会超过1023,在左位移12位
machineIdPart = (localIp & MAX_MACHINE_ID) << SEQUENCE_BIT;
}
/**
* 获取雪花ID
*/
public synchronized long nextId() {
long currentStamp = timeGen();
//避免机器时钟回拨
while (currentStamp < lastStamp) {
// //服务器时钟被调整了,ID生成器停止服务.
throw new RuntimeException(String.format("时钟已经回拨. Refusing to generate id for %d milliseconds", lastStamp - currentStamp));
}
if (currentStamp == lastStamp) {
// 每次+1
sequence = (sequence + 1) & MAX_SEQUENCE;
// 毫秒内序列溢出
if (sequence == 0) {
// 阻塞到下一个毫秒,获得新的时间戳
currentStamp = getNextMill();
}
} else {
//不同毫秒内,序列号置0
sequence = 0L;
}
lastStamp = currentStamp;
//时间戳部分+机器标识部分+序列号部分
return (currentStamp - START_TIMESTAMP) << TIMESTAMP_LEFT | machineIdPart | sequence;
}
/**
* 阻塞到下一个毫秒,直到获得新的时间戳
*/
private long getNextMill() {
long mill = timeGen();
//
while (mill <= lastStamp) {
mill = timeGen();
}
return mill;
}
/**
* 返回以毫秒为单位的当前时间
*/
protected long timeGen() {
return System.currentTimeMillis();
}
}
/**
* 获取long类型雪花ID
*/
public static long uniqueLong() {
return SnowFlake.SNOW_FLAKE.nextId();
}
/**
* 获取String类型雪花ID
*/
public static String uniqueLongHex() {
return String.format("%016x", uniqueLong());
}
/**
* 测试
*/
public static void main(String[] args) throws InterruptedException {
//计时开始时间
long start = System.currentTimeMillis();
//让100个线程同时进行
final CountDownLatch latch = new CountDownLatch(100);
//判断生成的20万条记录是否有重复记录
final Map<Long, Integer> map = new ConcurrentHashMap();
for (int i = 0; i < 100; i++) {
//创建100个线程
new Thread(() -> {
for (int s = 0; s < 2000; s++) {
long snowID = SnowIdUtils.uniqueLong();
log.info("生成雪花ID={}",snowID);
Integer put = map.put(snowID, 1);
if (put != null) {
throw new RuntimeException("主键重复");
}
}
latch.countDown();
}).start();
}
//让上面100个线程执行结束后,在走下面输出信息
latch.await();
log.info("生成20万条雪花ID总用时={}", System.currentTimeMillis() - start);
}
}
package com.zzsn.search.db;
import java.sql.SQLException;
import java.sql.Types;
/**
* 测试Demo
* @author Administrator
*/
public class TestBusIness{
static String searchSql = "select proxy from clb_proxy where ID= 1 ";
static String insertSql = "insert into score(name, age, score)values(?,?,?)";
static String deleteSql = "delete from score where id = ?";
static String updateSql = "update score set name = ? where id = ?";
static String searchSql2="select count(*) form from cis_ans_processitem where sid='202202150000' and ";
public static void main(String[] args) {
// intsertData();
searchData();
}
private static void intsertData()
{
DBManager dm = new DBManager();
String[] coulmn = new String[]{"wyf2", "23", "89.5"};
int[] type = new int[]{Types.CHAR, Types.INTEGER, Types.DOUBLE};
try {
boolean flag = dm.updateOrAdd(coulmn, type, insertSql);
if(flag)
System.out.println("插入成功");
} catch (SQLException e) {
e.printStackTrace();
}
}
private static void searchData()
{
DBManager dm = new DBManager();
String[] coulmn = null;
int[] type = null;
try {
DataTable dt = dm.getResultData(coulmn, type, searchSql);
if(dt != null && dt.getRowCount()> 0){
for(int i = 0; i<dt.getRowCount(); i++)
{
for(int j = 0; j<dt.getColCoun(); j++)
System.out.printf(dt.getRow()[i][j]+"\t");
System.out.println();
}
}
else
System.out.println("查询失败");
} catch (SQLException e) {
e.printStackTrace();
}
}
}
package com.zzsn.search.entity;
import lombok.Data;
import java.util.List;
@Data
public class ClbAnsProcessitem {
/**主键*/
private String id;
/**信息源id*/
private String sid;
/**团队id*/
private String tid;
/**标题*/
private String title;
/**摘要*/
private String summary;
/**关键词*/
private String keyWords;
/**正文*/
private String content;
private String contentWithtag;
/**未知*/
private String hash;
/**作者*/
private String author;
/**来源*/
private String sourceSite;
/**地址*/
private String sourceAddress;
/**未知*/
private String currentProcess;
/**类别*/
private String type;
/**未知*/
private String withTagFile;
/**发布时间*/
private String publishDate;
/**创建人*/
private String createBy;
/**创建时间*/
private String createDate;
/**编码*/
private String charset;
/**未知*/
private Integer processResult;
/**最新更新时间*/
private String lastModified;
/**组织id*/
private String orgId;
/**词*/
private String words;
/**来源*/
private String origin;
/**未知*/
private String orientation;
/**来源*/
private String fromWhere;
/**来源id*/
private String fromId;
/**来源类别*/
private String sourceType;
/**未知*/
private String featureWords;
/**下载地址*/
private String fileDownloadPath;
private String contentImgCvtTag;
/**关联地址*/
private String relatePlaces;
/**关联人*/
private String relatePerson;
/**关联组织*/
private String relateOrg;
/**事件*/
private String relateEvent;
/**时间*/
private String relateDate;
/**未知*/
private Integer relevance1;
/**未知*/
private String relevance;
/**语言*/
private String lang;
/**组织*/
private String orgs;
//采集来源(如通用、定制、微信公众号等)
private String source;
/**(临时处理)关联的专题id*/
private List<String> subjectIds;
}
\ No newline at end of file
package com.zzsn.search.entity;
import lombok.Data;
import java.util.List;
@Data
public class KeywordMsg {
/**主键*/
private String id;
/**词组编码*/
private String wordsCode;
/**词组名称*/
private String wordsName;
/**关键词*/
private String keyWord;
/**排除词*/
private String exclusionWord;
/**状态*/
private Integer status;
/**数据库查询数据时承接*/
private String subjectId;
/**所属专题*/
private List<String> subjectIds;
private Long startTime;
// private List<String> startTime;
private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
}
package com.zzsn.search.extractor;
import com.zzsn.utility.model.FileTag;
import com.zzsn.utility.util.DateUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 获取正文中的图片或者文件
* 创建人:李东亮
* 创建时间:2016-8-30 下午5:25:04
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class ContentFileFinder {
/**
* 获取父路径
* 创建人: 李东亮
* 创建时间: 2015-7-6 下午3:17:44
* @version 1.0
* @param path
* @return
* @throws IOException
*/
public static String getDirPath(String path) {
path = path.substring(0, path.lastIndexOf("/")) ;
return path;
}
/**
* 去除路径中的./
* 创建人: 李东亮
* 创建时间: 2015-7-6 下午3:43:00
* @version 1.0
* @param path
* @return
* @throws IOException
*/
public static String formatPath(String currentPageURL,String imgPath) {
String start="";
if(currentPageURL.indexOf("http://")!=-1){
start = "http://";
}else if(currentPageURL.indexOf("https://")!=-1){
start = "https://";
}
//绝对路径
if(imgPath.startsWith("/")){
currentPageURL = currentPageURL.replace(start, "");
int subIndex = currentPageURL.indexOf("/");
if(subIndex==-1){
subIndex = currentPageURL.length();
}
String domain = currentPageURL.substring(0, subIndex);
return start+domain+imgPath;
}
//相对路径
String path = currentPageURL+"/"+imgPath;
path = path.replaceAll(start, "D:/");
File f = new File(path);
String filePath="";
try {
filePath = f.getCanonicalPath();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String result = filePath.replaceAll("D:\\\\", start);
result = result.replaceAll("\\\\", "/");
return result;
}
/**
* 生成图片文件保存路径
* 创建人: 李东亮
* 创建时间: 2016-3-23 下午2:50:33
* @version 1.0
* @return
*/
private static String genImgFileName(String suffix){
String dir = DateUtil.format(new Date(), "yyyy-MM-dd");
String uuid = UUID.randomUUID().toString();
return dir+"/"+uuid+suffix;
}
/**
* 确保有src属性并且src属性指向正确的图片地址
* 创建人: 李东亮
* 创建时间: 2016-6-6 下午1:46:03
* @version 1.0
* @param rawTag
* @return
*/
public static Element ensureSrc(Element imgTag){
// Document doc = Jsoup.parseBodyFragment(rawTag);
String firstSrcAtt=null;
if(imgTag.hasAttr("original")){
firstSrcAtt = "original";
}else if(imgTag.hasAttr("data-src")){
firstSrcAtt = "data-src";
}else if(imgTag.hasAttr("_src")){
firstSrcAtt = "_src";
}else if(imgTag.hasAttr("src")){
firstSrcAtt ="src";
}
if(firstSrcAtt==null){
return null;
}
imgTag.attr("src", imgTag.attr(firstSrcAtt));
return imgTag;
}
/**
* 获取图片的绝对路径
* 创建人: 李东亮
* 创建时间: 2016-6-6 下午2:05:02
* @version 1.0
* @param element
* @param uri
* @return
*/
public static String getAbsolutePath(Element element,String uri,String linkAtt){
String absolutePath = element.attr(linkAtt);
if(absolutePath.startsWith("data:image")){
return null;
}
if(absolutePath.startsWith("file:")){
return null;
}
if (absolutePath.matches("(?i)^javascript.*|#")) {
return null;
}
if(absolutePath!=null && absolutePath.trim().length()>0 && !absolutePath.startsWith("http://")&&!absolutePath.startsWith("https://")&&uri!=null){
String puriDir = getDirPath(uri);
absolutePath = formatPath(puriDir,absolutePath);
}
return absolutePath;
}
/**
* 获取后缀名
* 创建人: 李东亮
* 创建时间: 2016-8-30 下午5:00:39
* @version 1.0
* @param uri
* @return
*/
public static String getSuffix(String uri){
uri = uri.replaceAll("http://|https://", "");
Pattern p = Pattern.compile("/.+(\\.\\w{1,4})$");
Matcher m = p.matcher(uri);
if(m.find()){
return m.group(1);
}
return "";
}
/**
* 获取正文中的文件标签,包含正文中的图片和附件
* 创建人: 李东亮
* 创建时间: 2016-9-8 下午3:01:09
* @version 1.0
* @param content
* @param sourceaddress
* @return
*/
public static Map<String,FileTag> getContentFileTag(String content,String sourceaddress){
Map<String,FileTag> imgMap = new HashMap<String,FileTag>();
if(content==null||content.length()==0){
return imgMap;
}
String rawTag;
String absolutePath;
FileTag fileTag;
String savePath;
Document doc = Jsoup.parse(content);
Elements imgTags = doc.select("img");
Element imgTag;
String suffix = "";
String filePathAttr;
String preFixPath;
for (Iterator<Element> iterator = imgTags.iterator(); iterator.hasNext();) {
fileTag = new FileTag();
imgTag = iterator.next();
rawTag = imgTag.outerHtml().replaceAll("\"", "'").replaceAll("'>", "' \\/>");
if(imgTag.tagName().toLowerCase().equals("img")){
filePathAttr = "src";
//使src指向正确的图片显示路径
imgTag = ensureSrc(imgTag);
preFixPath="IMG_SERVER/";
}else
{
filePathAttr="href";
fileTag.setFileName(imgTag.text());
preFixPath="FILE_SERVER/";
}
//获取图片的绝对路径,并且使src指向图片的绝对路径
absolutePath = getAbsolutePath(imgTag,sourceaddress,filePathAttr);
if(absolutePath==null){
continue;
}
imgTag.attr(filePathAttr,absolutePath);
fileTag.setAbsolutePath(absolutePath);
fileTag.setAbsoluteTag(imgTag.outerHtml());
//图片保存路径
suffix = ContentFileFinder.getSuffix(absolutePath);
savePath = genImgFileName(suffix);
fileTag.setSavePath(savePath);
//图片保存标签
imgTag.attr(filePathAttr,preFixPath+fileTag.getSavePath());
fileTag.setSaveTag(imgTag.outerHtml());
//key为图片完整路径
imgMap.put(rawTag, fileTag);
}
return imgMap;
}
/**
* 得到网页中图片的地址
* @param sets html字符串
*/
public static Set<String> getImgStr(String htmlStr) {
Set<String> pics = new HashSet<String>();
String img = "";
Pattern p_image;
Matcher m_image;
String regEx_img = "<img.*src\\s*=\\s*(.*?)[^>]*?>";
p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
m_image = p_image.matcher(htmlStr);
while (m_image.find()) {
// 得到<img />数据
img = m_image.group();
System.out.println(img);
// 匹配<img>中的src数据
Matcher m = Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)").matcher(img);
while (m.find()) {
pics.add(m.group(1));
}
}
return pics;
}
public static String rmHtmlImgOrAtag(String htmlStr) {
//匹配img标签的正则表达式
String regxpForImgTag = "<img\\s[^>]+/>|<img.*>(.*?)</img>|<a.*>(.*?)</a>|<a.*/>";
Pattern pattern = Pattern.compile(regxpForImgTag);
Matcher matcher = pattern.matcher(htmlStr);
while (matcher.find()) {
String temp = matcher.group();
htmlStr = htmlStr.replace(temp, " ");
}
return htmlStr;
}
public static void main(String[] args) {
String str = "<span></span><img data-src=\"http://static.tianyaui.com/img/static/2011/imgloading.gif\" title=\"点击图片查看幻灯模式\" original2=\"http://img3.laibafile.cn/p/l/246500759.jpg\" /> "
+ "<div>hello<div/><img data-src=\"http://static.tianyaui.com/img/static/2011/imgloading.gif\" title=\\\"点击图片查看幻灯模式\\\" original2=\\\"http://img3.laibafile.cn/p/l/246500759.jpg\\\" >qew</img>";
// System.out.println(ContentFileFinder.getSuffix("http://www.baidu.com//a.xls"));
// getContentFileTag(str,"");
str=rmHtmlImgOrAtag(str);
System.out.println(str);
String aString=str+"<div class=\"_3L5YSq\" title=\"vue兼容ie11的解决方法 2021年最新解决方案\">"
+ "<a class=\"_1-HJSV _1OhGeD\" href=\"/p/4718213d9373\" target=\"_blank\" rel=\"noopener noreferrer\">vue兼容ie11的解决方法 2021年最新解决方案</a>"
+ "<span>qwqe<a href='qwe'/></span</div>";
aString=rmHtmlImgOrAtag(aString);
System.out.println(aString);
}
}
package com.zzsn.search.extractor;
/**
* 默认返回对象设置
* 创建人:李东亮
* 创建时间:2015-5-13 上午10:23:02
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class DefaultMsg {
private int success = 1;
private String msg;
public DefaultMsg()
{
}
public DefaultMsg(int success)
{
this.success = success;
}
public int getSuccess() {
return success;
}
public void setSuccess(int success) {
this.success = success;
}
public String getMsg() {
return msg;
}
public void setMsg(String msg) {
this.msg = msg;
}
}
package com.zzsn.search.extractor;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.TimeoutException;
public class MemcachedFactory {
private static final Logger Log = LoggerFactory.getLogger(MemcachedFactory.class);
private static ApplicationContext factory;
protected static MemcachedClient client;
/**
* 加载spring容器
* 创建人: 李东亮 MemcachedFactory
* 创建时间: 2015-5-30 上午11:39:31
* @version 1.0
*/
public static void init(){
factory = new ClassPathXmlApplicationContext("conf/spring-memcached.xml");
client = (MemcachedClient )factory.getBean("memcachedClient");
}
/**
* 获取Key值
* @param key
* @return
*/
public static Object getKey(String key) {
try {
Object obj = client.get(key);
if("null".equals(obj)){
return null;
}
return obj;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return null;
}
/**
* 设置缓存 10天
* @param key
* @return
*/
public static boolean setExpireskey(String key, Object value) {
boolean result = false;
try {
if(value==null){
value="null";
}
if (client.get(key) != null ) {
result = client.replace(key, 3600*24*10, value);
} else {
result = client.set(key, 3600*24*10, value);
}
} catch (TimeoutException | InterruptedException | MemcachedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
/**
* 设置缓存 永不过期,(一个月后会自动过期)
* @param key
* @return
*/
public static boolean setKey(String key, Object value) {
boolean result = false;
try {
if(value==null){
value="null";
}
if (client.get(key) != null ) {
result = client.replace(key, 0, value);
} else {
result = client.set(key, 0, value);
}
} catch (TimeoutException | InterruptedException | MemcachedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
public static void setLset(String key,String value){
// client
}
public static void main(String[] args) throws UnsupportedEncodingException {
MemcachedFactory.init();
// DBServiceFactory.init();
Object object=MemcachedFactory.getKey("domainUri_www.ceconline.com");
System.out.println(object);
// List<Site> sites = DBServiceFactory.getCrawlerService().getSnSites();
/* //memcached清空缓存
Map<String,String> map = new HashMap<String,String>();
for(int i=0;i<1000000;i++){
map.put(i+"","adadadadadadadadadadadadadadader");
}
MemcachedFactory.setKey("aa", map);
Map<String,String> m = (Map<String, String>) MemcachedFactory.getKey("aa");
System.out.println(m.size());*/
//MemcachedFactory.flushAll();
}
}
package com.zzsn.search.oracledb;
import com.zzsn.utility.index.Constants;
import java.sql.Connection;
import java.sql.SQLException;
/**
* 数据库连接层MYSQL
* @author Administrator
*
*/
public class OracleDBConnection {
/**
* 连接数据库
* @return
*/
public static Connection getDBConnection()
{
// 1. 注册驱动
try {
Class.forName("oracle.jdbc.driver.OracleDriver");
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// 获取数据库的连接
try {
Connection conn = java.sql.DriverManager.getConnection("jdbc:oracle:thin:@114.116.91.1:1521:orcl", "cis", "ZZsn9988_1qaz");
return conn;
} catch (SQLException e1) {
e1.printStackTrace();
}
return null;
}
}
package com.zzsn.search.oracledb;
import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
/**
* MYSQL数据库底层封装
* @author Administrator
*
*/
public class OracleDBManager {
private PreparedStatement pstmt;
private Connection conn;
private ResultSet rs;
/**
* 打开数据库
*/
public OracleDBManager() {
conn = OracleDBConnection.getDBConnection();
}
/**
* 执行修改添加操作
* @param coulmn
* @param type
* @param sql
* @return
* @throws SQLException
*/
public boolean updateOrAdd(String[] coulmn, int[] type, String sql) throws SQLException
{
if(!setPstmtParam(coulmn, type, sql))
return false;
boolean flag = pstmt.executeUpdate()>0?true:false;
closeDB();
return flag;
}
/**
* 获取查询结果集
* @param coulmn
* @param type
* @param sql
* @throws SQLException
*/
public OracleDataTable getResultData(String[] coulmn, int[] type, String sql) throws SQLException
{
OracleDataTable dt = new OracleDataTable();
ArrayList<HashMap<String, String>>list = new ArrayList<HashMap<String, String>>();
if(!setPstmtParam(coulmn, type, sql))
return null;
rs = pstmt.executeQuery();
ResultSetMetaData rsmd = rs.getMetaData();//取数据库的列名
int numberOfColumns = rsmd.getColumnCount();
while(rs.next())
{
HashMap<String, String> rsTree = new HashMap<String, String>();
for(int r=1;r<numberOfColumns+1;r++)
{
rsTree.put(rsmd.getColumnName(r),rs.getObject(r).toString());
}
list.add(rsTree);
}
closeDB();
dt.setDataTable(list);
return dt;
}
/**
* 参数设置
* @param coulmn
* @param type
* @throws SQLException
* @throws NumberFormatException
*/
private boolean setPstmtParam(String[] coulmn, int[] type, String sql) throws NumberFormatException, SQLException
{
if(sql== null) return false;
pstmt = conn.prepareStatement(sql);
if(coulmn != null && type != null && coulmn.length !=0 && type.length !=0 )
{
for (int i = 0; i<type.length; i++) {
switch (type[i]) {
case Types.INTEGER:
pstmt.setInt(i+1, Integer.parseInt(coulmn[i]));
break;
case Types.BOOLEAN:
pstmt.setBoolean(i+1, Boolean.parseBoolean(coulmn[i]));
break;
case Types.CHAR:
pstmt.setString(i+1, coulmn[i]);
break;
case Types.DOUBLE:
pstmt.setDouble(i+1, Double.parseDouble(coulmn[i]));
break;
case Types.FLOAT:
pstmt.setFloat(i+1, Float.parseFloat(coulmn[i]));
break;
default:
break;
}
}
}
return true;
}
/**
* 关闭数据库
* @throws SQLException
*/
private void closeDB() throws SQLException
{
if(rs != null)
{
rs.close();
}
if(pstmt != null)
{
pstmt.close();
}
if(conn != null)
{
conn.close();
}
}
}
\ No newline at end of file
package com.zzsn.search.oracledb;
import java.util.*;
/**
* 数据集封装
* @author Administrator
*/
public class OracleDataTable {
public String[] column;//列字段
public String[][] row; //行值
public int rowCount = 0;//行数
public int colCoun = 0;//列数
public OracleDataTable() {
super();
}
public OracleDataTable(String[] column, String[][] row, int rowCount, int colCoun) {
super();
this.column = column;
this.row = row;
this.rowCount = rowCount;
this.colCoun = colCoun;
}
public void setDataTable(ArrayList<HashMap<String, String>> list) {
rowCount = list.size();
colCoun = list.get(0).size();
column = new String[colCoun];
row = new String[rowCount][colCoun];
for (int i = 0; i < rowCount; i++) {
Set<Map.Entry<String, String>> set = list.get(i).entrySet();
int j = 0;
for (Iterator<Map.Entry<String, String>> it = set.iterator(); it
.hasNext();) {
Map.Entry<String, String> entry = (Map.Entry<String, String>) it
.next();
row[i][j] = entry.getValue();
if (i == rowCount - 1) {
column[j] = entry.getKey();
}
j++;
}
}
}
public String[] getColumn() {
return column;
}
public void setColumn(String[] column) {
this.column = column;
}
public String[][] getRow() {
return row;
}
public void setRow(String[][] row) {
this.row = row;
}
public int getRowCount() {
return rowCount;
}
public void setRowCount(int rowCount) {
this.rowCount = rowCount;
}
public int getColCoun() {
return colCoun;
}
public void setColCoun(int colCoun) {
this.colCoun = colCoun;
}
}
\ No newline at end of file
package com.zzsn.search.oracledb;
import lombok.extern.slf4j.Slf4j;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
/**
* @author xub
* @Description: 雪花算法
* @date 2019/8/14 下午8:22
*/
@Slf4j
public class SnowIdUtils {
/**
* 私有的 静态内部类
*/
private static class SnowFlake {
/**
* 内部类对象(单例模式)
*/
private static final SnowFlake SNOW_FLAKE = new SnowFlake();
/**
* 起始的时间戳
*/
private final long START_TIMESTAMP = 1557489395327L;
/**
* 序列号占用位数
*/
private final long SEQUENCE_BIT = 12;
/**
* 机器标识占用位数
*/
private final long MACHINE_BIT = 10;
/**
* 时间戳位移位数
*/
private final long TIMESTAMP_LEFT = SEQUENCE_BIT + MACHINE_BIT;
/**
* 最大序列号 (4095)
*/
private final long MAX_SEQUENCE = ~(-1L << SEQUENCE_BIT);
/**
* 最大机器编号 (1023)
*/
private final long MAX_MACHINE_ID = ~(-1L << MACHINE_BIT);
/**
* 生成id机器标识部分
*/
private long machineIdPart;
/**
* 序列号
*/
private long sequence = 0L;
/**
* 上一次时间戳
*/
private long lastStamp = -1L;
/**
* 构造函数初始化机器编码
*/
private SnowFlake() {
//模拟这里获得本机机器编码
long localIp = 4321;
//localIp & MAX_MACHINE_ID最大不会超过1023,在左位移12位
machineIdPart = (localIp & MAX_MACHINE_ID) << SEQUENCE_BIT;
}
/**
* 获取雪花ID
*/
public synchronized long nextId() {
long currentStamp = timeGen();
//避免机器时钟回拨
while (currentStamp < lastStamp) {
// //服务器时钟被调整了,ID生成器停止服务.
throw new RuntimeException(String.format("时钟已经回拨. Refusing to generate id for %d milliseconds", lastStamp - currentStamp));
}
if (currentStamp == lastStamp) {
// 每次+1
sequence = (sequence + 1) & MAX_SEQUENCE;
// 毫秒内序列溢出
if (sequence == 0) {
// 阻塞到下一个毫秒,获得新的时间戳
currentStamp = getNextMill();
}
} else {
//不同毫秒内,序列号置0
sequence = 0L;
}
lastStamp = currentStamp;
//时间戳部分+机器标识部分+序列号部分
return (currentStamp - START_TIMESTAMP) << TIMESTAMP_LEFT | machineIdPart | sequence;
}
/**
* 阻塞到下一个毫秒,直到获得新的时间戳
*/
private long getNextMill() {
long mill = timeGen();
//
while (mill <= lastStamp) {
mill = timeGen();
}
return mill;
}
/**
* 返回以毫秒为单位的当前时间
*/
protected long timeGen() {
return System.currentTimeMillis();
}
}
/**
* 获取long类型雪花ID
*/
public static long uniqueLong() {
return SnowFlake.SNOW_FLAKE.nextId();
}
/**
* 获取String类型雪花ID
*/
public static String uniqueLongHex() {
return String.format("%016x", uniqueLong());
}
/**
* 测试
*/
public static void main(String[] args) throws InterruptedException {
//计时开始时间
long start = System.currentTimeMillis();
//让100个线程同时进行
final CountDownLatch latch = new CountDownLatch(100);
//判断生成的20万条记录是否有重复记录
final Map<Long, Integer> map = new ConcurrentHashMap();
for (int i = 0; i < 100; i++) {
//创建100个线程
new Thread(() -> {
for (int s = 0; s < 2000; s++) {
long snowID = SnowIdUtils.uniqueLong();
log.info("生成雪花ID={}",snowID);
Integer put = map.put(snowID, 1);
if (put != null) {
throw new RuntimeException("主键重复");
}
}
latch.countDown();
}).start();
}
//让上面100个线程执行结束后,在走下面输出信息
latch.await();
log.info("生成20万条雪花ID总用时={}", System.currentTimeMillis() - start);
}
}
package com.zzsn.search.oracledb;
import java.sql.SQLException;
import java.sql.Types;
/**
* 测试Demo
* @author Administrator
*/
public class TestBusIness{
static String searchSql = "select proxy from CIS_sys_Proxy where ID = 1";
static String insertSql = "insert into score(name, age, score)values(?,?,?)";
static String deleteSql = "delete from score where id = ?";
static String updateSql = "update score set name = ? where id = ?";
static String searchSql2="select count(*) form from cis_ans_processitem where sid='202202150000' and ";
public static void main(String[] args) {
// intsertData();
searchData();
}
private static void intsertData()
{
OracleDBManager dm = new OracleDBManager();
String[] coulmn = new String[]{"wyf2", "23", "89.5"};
int[] type = new int[]{Types.CHAR, Types.INTEGER, Types.DOUBLE};
try {
boolean flag = dm.updateOrAdd(coulmn, type, insertSql);
if(flag)
System.out.println("插入成功");
} catch (SQLException e) {
e.printStackTrace();
}
}
private static void searchData()
{
OracleDBManager dm = new OracleDBManager();
String[] coulmn = null;
int[] type = null;
try {
OracleDataTable dt = dm.getResultData(coulmn, type, searchSql);
if(dt != null && dt.getRowCount()> 0){
for(int i = 0; i<dt.getRowCount(); i++)
{
for(int j = 0; j<dt.getColCoun(); j++)
System.out.printf(dt.getRow()[i][j]+"\t");
System.out.println();
}
}
else
System.out.println("查询失败");
} catch (SQLException e) {
e.printStackTrace();
}
}
}
package com.zzsn.search.util;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* HttpClient库使用代理
*/
public class HttpClientProxy {
String proxyHost;
int proxyPort;
String proxyAccount;
String proxyPwd;
/**
* @param host 代理主机地址
* @param port 代理主机端口
*/
public static HttpClientProxy build(String host, int port) {
HttpClientProxy proxy = new HttpClientProxy();
proxy.proxyHost = host;
proxy.proxyPort = port;
return proxy;
}
/**
* @param host 代理主机地址
* @param port 代理主机端口
* @param acc 代理认证账号
* @param pwd 代理认证口令
*/
public static HttpClientProxy build(String host, int port, String acc, String pwd) {
HttpClientProxy proxy = new HttpClientProxy();
proxy.proxyHost = host;
proxy.proxyPort = port;
proxy.proxyAccount = acc;
proxy.proxyPwd = pwd;
return proxy;
}
public void test() throws IOException {
String targetUrl = "http://myip.ipip.net";
CloseableHttpClient client = null;
if (proxyAccount == null || proxyPwd == null) {
client = getHttpClient(proxyHost, proxyPort);
} else {
// 账号密码验证
client = getHttpClient(proxyHost, proxyPort, proxyAccount, proxyPwd);
}
HttpGet httpGet = new HttpGet(targetUrl);
CloseableHttpResponse response = client.execute(httpGet);
String resultStr = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
System.out.println(resultStr);
}
public String requestUrl(String url) throws IOException {
String targetUrl = url;
CloseableHttpClient client = null;
if (proxyAccount == null || proxyPwd == null) {
client = getHttpClient(proxyHost, proxyPort);
} else {
// 账号密码验证
client = getHttpClient(proxyHost, proxyPort, proxyAccount, proxyPwd);
}
HttpGet httpGet = new HttpGet(targetUrl);
httpGet.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
httpGet.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000);
// 伪装成浏览器
httpGet.setHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpGet.setHeader(HttpHeaders.CONNECTION, "close");
CloseableHttpResponse response = client.execute(httpGet);
String resultStr = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
// System.out.println(resultStr);
return resultStr;
}
/**
* 代理不需要账号密码认证的httpClient
*/
private static CloseableHttpClient getHttpClient(String proxyHost, int proxyPort) {
HttpHost proxy = new HttpHost(proxyHost, proxyPort, "HTTP");
return HttpClients.custom()
.setProxy(proxy)
.build();
}
/**
* 代理需要账号密码认证的httpClient
*/
private static CloseableHttpClient getHttpClient(String proxyHost, int proxyPort, String acc, String pwd) {
HttpHost proxy = new HttpHost(proxyHost, proxyPort, "HTTP");
CredentialsProvider provider = new BasicCredentialsProvider();
provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(acc, pwd));
return HttpClients.custom()
.setProxy(proxy)
.setDefaultCredentialsProvider(provider)
.build();
}
}
\ No newline at end of file
package com.zzsn.search.util;
import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.db.DBManager;
import com.zzsn.search.db.DataTable;
import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable;
import org.springframework.beans.factory.annotation.Autowired;
import java.sql.SQLException;
public class ProxyIp {
public String getProxyIP(){
String sql = "select proxy from clb_proxy where ID = "+ RandomUtil.randomInt(1,5);
String proxy="";
DBManager dm = new DBManager();
String[] coulmn = null;
int[] type = null;
try {
DataTable dt = dm.getResultData(coulmn, type, sql);
if(dt != null && dt.getRowCount()> 0){
for(int i = 0; i<dt.getRowCount(); i++)
{
for(int j = 0; j<dt.getColCoun(); j++) {
proxy=dt.getRow()[i][j];
}
}
}
else
System.out.println("查询失败");
} catch (SQLException e) {
e.printStackTrace();
}
return proxy;
}
// @Autowired
// private JdbcTemplate jdbcTemplate=SpringContextUtil.getBean(JdbcTemplate.class);
//
// public String getProxyIP(){
//
// String sql = "select proxy from CIS_sys_Proxy where ID = "+ RandomUtil.randomInt(1,5);
// String proxy="";
// try {
// proxy = jdbcTemplate.queryForObject(sql, String.class);
// }catch (Exception e){
// try {
// Thread.sleep(3000);
// proxy = jdbcTemplate.queryForObject(sql, String.class);
// }catch (Exception ee){
// e.getMessage();
// }
// }
// return proxy;
// }
}
package com.zzsn.search.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class SplitKeyword {
//关键词处理
public static List<String> transForm(String keyWords) {
if (keyWords == null) {
return null;
}
//(*备注:或:用“,”号表示,与:用“+”号表示,非:用“-”号表示)
keyWords = keyWords.replaceAll("[,,;;]", "|");
keyWords = keyWords.replace("(", "(");
keyWords = keyWords.replace(")", ")");
List<String> keyWordsList = new ArrayList<>();
if (!keyWords.contains("+")) { //不含有+ 形式:A|B|C
keyWords = keyWords.replace("(", "");
keyWords = keyWords.replace(")", "");
keyWordsList = Arrays.asList(keyWords.split("\\|"));
} else if (keyWords.contains(")+(")) { //含有+ 形式:(A|B)+(C|D)+(E|F)
String[] cronArray = keyWords.split("\\+");
List<List<String>> list = new ArrayList<>();
for (int i = 0; i < cronArray.length; i++) {
cronArray[i] = cronArray[i].replace("(", "");
cronArray[i] = cronArray[i].replace(")", "");
cronArray[i] = cronArray[i].replace("|", ",");
list.add(Arrays.asList(cronArray[i].split(",")));
}
List<String> stringList = getPermutations(list);
for (String str : stringList) {
keyWordsList.add(str.substring(0, str.length() - 1));
}
}else if (keyWords.contains(")+")) { //含有+ 形式:(A|B)+(C|D)+(E|F)
String[] cronArray = keyWords.split("\\+");
List<List<String>> list = new ArrayList<>();
for (int i = 0; i < cronArray.length; i++) {
cronArray[i] = cronArray[i].replace("(", "");
cronArray[i] = cronArray[i].replace(")", "");
cronArray[i] = cronArray[i].replace("|", ",");
list.add(Arrays.asList(cronArray[i].split(",")));
}
List<String> stringList = getPermutations(list);
for (String str : stringList) {
keyWordsList.add(str.substring(0, str.length() - 1));
}
}
return keyWordsList;
}
/**
* 多个数组排列组合
* @param list 原始list
* @param <T> 数据类型
* @return
*/
private static <T> List<List<T>> getDescartes(List<List<T>> list) {
List<List<T>> returnList = new ArrayList<>();
descartesRecursive(list, 0, returnList, new ArrayList<T>());
return returnList;
}
/**
* 递归实现
* 原理:从原始list的0开始依次遍历到最后
*
* @param originalList 原始list
* @param position 当前递归在原始list的position
* @param returnList 返回结果
* @param cacheList 临时保存的list
*/
private static <T> void descartesRecursive(List<List<T>> originalList, int position, List<List<T>> returnList, List<T> cacheList) {
List<T> originalItemList = originalList.get(position);
for (int i = 0; i < originalItemList.size(); i++) {
//最后一个复用cacheList,节省内存
List<T> childCacheList = (i == originalItemList.size() - 1) ? cacheList : new ArrayList<>(cacheList);
childCacheList.add(originalItemList.get(i));
if (position == originalList.size() - 1) {//遍历到最后退出递归
returnList.add(childCacheList);
continue;
}
descartesRecursive(originalList, position + 1, returnList, childCacheList);
}
}
/**
* 多个数组排列组合
* @param list 原始list
* @return
*/
private static List<String> getPermutations(List<List<String>> list) {
List<String> resultList = new ArrayList<>();
List<List<String>> list1 = getDescartes(list);
if(list1.size()>0){
list1.forEach(temp->{
String str = listToStr(temp);
if (!org.springframework.util.StringUtils.isEmpty(str)) {
resultList.add(listToStr(temp));
}
});
}
return resultList;
}
private static String listToStr(List<String> list){
StringBuffer str = new StringBuffer();
if(list.size()>0){
list.forEach(temp->{
str.append(temp).append("+");
});
}
return str.toString();
}
public static void main(String[] args) {
String kwords="(国家能源投资集团有限责任公司|国家能源集团|国家能源投资集团|中国中煤能源集团有限公司|中煤能源集团|中煤集团|国家开发投资集团有限公司|国投|华润(集团)有限公司|华润|华润集团|中国华能集团有限公司|中国华能|华能集团|国家电力投资集团有限公司|国家电投|中国华电集团有限公司|中国华电|中国华电集团公司|中国大唐集团有限公司|中国大唐集团)+煤炭+(采购|销售|中长期合同|电力|保供|哄抬|倒卖)";
List<String> strings = transForm(kwords);
for (String key :strings) {
System.out.println(key);
}
}
}
package com.zzsn.search.util;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
import java.util.Map;
/**
* 获取Spring的ApplicationContext对象工具,可以用静态方法的方式获取spring容器中的bean
* @author https://blog.csdn.net/chen_2890
* @date 2019/6/26 16:20
*/
@Component
public class SpringContextUtil implements ApplicationContextAware {
private static ApplicationContext applicationContext;
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
SpringContextUtil.applicationContext = applicationContext;
}
/**
* 获取applicationContext
*/
public static ApplicationContext getApplicationContext() {
return applicationContext;
}
/**
* 通过name获取 Bean.
*/
public static Object getBean(String name) {
Object o = null;
try {
o = getApplicationContext().getBean(name);
} catch (NoSuchBeanDefinitionException e) {
// e.printStackTrace();
}
return o;
}
/**
* 通过class获取Bean.
*/
public static <T> T getBean(Class<T> clazz) {
return getApplicationContext().getBean(clazz);
}
/**
* 通过name,以及Clazz返回指定的Bean
*/
public static <T> T getBean(String name, Class<T> clazz) {
return getApplicationContext().getBean(name, clazz);
}
/**
* 通过name获取 Bean.
*/
public static <T> Map<String, T> getBeansOfType(Class<T> clazz) {
return getApplicationContext().getBeansOfType(clazz);
}
/**
* 获取配置文件配置项的值
*
* @param key 配置项key
*/
public static String getEnvironmentProperty(String key) {
return getApplicationContext().getEnvironment().getProperty(key);
}
/**
* 获取spring.profiles.active
*/
public static String getActiveProfile() {
return getApplicationContext().getEnvironment().getActiveProfiles()[0];
}
}
package com.zzsn.test;
import cn.hutool.core.util.RandomUtil;
import com.google.gson.Gson;
import com.zzsn.search.entity.KeywordMsg;
public class KeyTest {
public static void main(String[] args) {
System.out.println(RandomUtil.randomInt(5));
String value="{\"@class\":\"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\"," +
"\"id\":\"1532331241232039937\",\"wordsCode\":\"KW-20220602-0003\"," +
"\"wordsName\":\"人工智能应用\"," +
"\"keyWord\":\"(人工智能|人工智能应用|应用|人工智能技术|人工智能领域|人工智能系统|人工智能产品|智能汽车|无人驾驶|人脸识别|人像识别|面部识别|机器翻译|自然语言处理|声纹识别|智能客服|智能音箱|语音识别|语音合成|个性化推荐|图像识别|图像搜索|人工智能应用|大数据分析|大数据|人工智能设计|人机交互|人工智能方案|人工智能解决方案|人工智能实验室|人工智能模型|人工智能问题|人工智能流程|人工智能设备|生成式对抗网络|计算智能|感知智能|认知智能|机器学习|增强学习|结构化数据|非结构化数据|传感器|理解能力|归纳能力|推理能力|特征提取|模式分析|预测|智能农业|智能工业|智能工厂|工业机器人|智能手机|无人驾驶汽车|无人机|智能机器人|环境感知|路径规划|行为决策|算法|智能分拣|设备健康管理|表面缺陷检测|智能决策|数字孪生|创成式设计|需求预测|供应链优化|深度学习|Applications of artificial intelligence|artificial intelligence|Applications|AI|Driverless Car|Automatic Speech Recognition|ASR|Natural Language Processing|NLP|Text To Speech|TTS|GAN| generative adversarial network|SLAM|simultaneous localization and mapping|Generative Design|AI Application in E-Commerce|Personalized Shopping|AI-powered Assistants|Fraud Prevention| Applications Of Artificial Intelligence in Education|Administrative Tasks Automated to Aid Educators|Administrative Tasks Automated to Aid Educators|Creating Smart Content|Voice Assistants|Personalized Learning|Applications of Artificial Intelligence in Lifestyle|Autonomous Vehicles|Spam Filters|Facial Recognition|Recommendation System|Applications of Artificial intelligence in Navigation|Applications of Artificial Intelligence in Robotics|Applications of Artificial Intelligence in Human Resource|Applications of Artificial Intelligence in Healthcare|Applications of Artificial Intelligence in Agriculture|Applications of Artificial Intelligence in Gaming|Applications of Artificial Intelligence in Automobiles|Applications of Artificial Intelligence in Social Media|Applications of Artificial Intelligence in Marketing| Applications of Artificial Intelligence in Chatbots|Applications of of Artificial Intelligence in Finance)+(人工智能|artificial intelligence|AI)\\n\"," +
"\"exclusionWord\":null,\"status\":\"1\",\"subjectId\":null," +
"\"subjectIds\":null,\"startTime\":[\"java.util.Date\",1640966400000],\"endTime\":null," +
"\"searchEngines\":[\"java.util.ArrayList\",[\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"10\"]],\"crawlerType\":\"1\"}";
System.out.println(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")));
String aa=value.replace(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")),"");
// System.out.println(aa);
KeywordMsg keywordMsg = new Gson().fromJson(value, KeywordMsg.class);
// System.out.println(keywordMsg);
}
}
package com.zzsn.test;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
public class TimePaser {
public static void main(String[] args) throws Exception{
// String aa="2022-04-18";
// String s = dateToStamp(aa);
// System.out.println(s);
Date date = new Date();
String nowTime="";
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
// String format = simpleDateFormat.format("1650384000");
System.out.println(stampToTime("1650384000"));
}
public static String dateToStamp(String s) throws ParseException {
String res;
//设置时间模版
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
Date date = simpleDateFormat.parse(s);
long ts = date.getTime()/1000;
res = String.valueOf(ts);
return res;
}
//将时间戳转换为时间
public static String stampToTime(String s) throws Exception{
String res;
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
long lt = new Long(s);
//将时间戳转换为时间
Date date = new Date(lt*1000);
//将时间调整为yyyy-MM-dd HH:mm:ss时间样式
res = simpleDateFormat.format(date);
return res;
}
}
package com.zzsn.utility.index;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
* 系统变量
* 创建人:李东亮
* 创建时间:2015-5-12 上午11:52:07
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Slf4j
public class Constants {
private static Properties prop = getConfig();
// private static Properties prop = new Properties();
// static {
// try {
// prop.load(Constants.class.getResourceAsStream("constants.properties"));
// } catch (IOException e) {
// // TODO Auto-generated catch block
// log.error("错误", e);
// e.printStackTrace();
// }
// }
public static Properties getConfig() {
Properties properties = new Properties();
InputStream is = null;
String location = "constants.properties";
try {
Resource resource = new DefaultResourceLoader().getResource(location);
is = resource.getInputStream();
properties.load(is);
log.debug("constants config: {}", properties.toString());
} catch (IOException ex) {
log.error("Could not load property file:" + location, ex);
} finally {
try {
if (is != null) {
is.close();
}
} catch (IOException ioe) {
// ignore
}
}
return properties;
}
//爬取到的网页类型
public static final String TYPE_HTML = "HTML";
public static final String TYPE_EXCEL = "EXCEL";
public static final String TYPE_WORD = "WORD";
public static final String TYPE_PPT = "PPT";
public static final String TYPE_PDF = "PDF";
public static final String TYPE_IMG = "IMG";
public static final String DEFAULT_LANG="cn";
public static final String WORK_URL = prop.getProperty("WORK_URL");
public static final String LOCAL_URL = prop.getProperty("LOCAL_URL");
public static final String ORGS = prop.getProperty("ORGS");
public static final String TID = prop.getProperty("TID");
public static final Integer PROXYFLAG = Integer.parseInt(prop.getProperty("PROXY"));
public static final long PROXYID = Long.parseLong(prop.getProperty("PROXYID"));
public static final String CHROMEDRIVE = prop.getProperty("CHROMEDRIVE");
public static final String CHROMEBIN = prop.getProperty("CHROMEBIN");
public static final String KAFKA_CONSUMER_SERVERS= prop.getProperty("KAFKA_CONSUMER_SERVERS");
public static final String KAFKA_CONSUMER_TOPIC= prop.getProperty("KAFKA_CONSUMER_TOPIC");
public static final String KAFKA_CONSUMER_GROUP_ID= prop.getProperty("KAFKA_CONSUMER_GROUP_ID");
public static final String KAFKA_CONSUMER_AUTO_OFFSET_RESET= prop.getProperty("KAFKA_CONSUMER_AUTO_OFFSET_RESET");
public static final String PROXY= prop.getProperty("PROXY");
public static final String KAFKA_PRODUCT_TOPIC= prop.getProperty("KAFKA_PRODUCT_TOPIC");
public static final String KAFKA_PRODUCT_URLLIST_TOPIC= prop.getProperty("KAFKA_PRODUCT_URLLIST_TOPIC");
public static final String KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC= prop.getProperty("KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC");
// public static final String KAFKA_PRODUCT_PASERURL_TOPIC= prop.getProperty("KAFKA_PRODUCT_PASERURL_TOPIC");
public static final String SOURCEADDRESS="SOURCEADDRESS_";
public static final String META_SEARCH_URL= prop.getProperty("META_SEARCH_URL");
public static final String META_SEARCH_KEYWORDPATH= prop.getProperty("META_SEARCH_KEYWORDPATH");
// redis
public static final String REDIS_LOCALHOST= prop.getProperty("redis.host");
public static final String REDIS_PORT= prop.getProperty("redis.port");
public static final String REDIS_PASS= prop.getProperty("redis.pass");
public static final String REDIS_TIMEOUT= prop.getProperty("redis.timeout");
public static final String REDIS_MAXIDLE= prop.getProperty("redis.maxIdle");
public static final String REDIS_MAXTOTAL= prop.getProperty("redis.maxTotal");
public static final String REDIS_MAXWAITMILLIS= prop.getProperty("redis.maxWaitMillis");
public static final String REDIS_TESTONBORROW= prop.getProperty("redis.testOnBorrow");
public static final Integer THREAD_SIZE =Integer.valueOf(prop.getProperty("THREAD_SIZE"));
public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION");
public static final String KAFKA_PRODUCT_PARTITION= prop.getProperty("KAFKA_PRODUCT_PARTITION");
public static final String CRAWLER_SERVER= prop.getProperty("crawler_server");
// public static final String ORACLE_URL= prop.getProperty("oracle_url");
// public static final String ORACLE_USERNAME= prop.getProperty("oracle_username");
// public static final String ORACLE_PASSWORD= prop.getProperty("oracle_password");
// public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
//
// public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC");
//
// public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize"));
//
// public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger"));
// public static final String WEBBEAN_MAXCONNECTIONS= prop.getProperty("WEBBEAN_MAXCONNECTIONS");
// public static final String WEBBEAN_MINCONNECTIONS= prop.getProperty("WEBBEAN_MINCONNECTIONS");
// public static final String WEBBEAN_INITCONNECTIONS= prop.getProperty("WEBBEAN_INITCONNECTIONS");
// public static final String WEBBEAN_CONNINTERVAL= prop.getProperty("WEBBEAN_CONNINTERVAL");
// public static final String WEBBEAN_TIMEOUT= prop.getProperty("WEBBEAN_TIMEOUT");
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论