package com.walker.wxtools.spider;
|
|
import com.walker.infrastructure.utils.KeyValue;
|
import com.walker.infrastructure.utils.StringUtils;
|
import com.walker.spider.KeyValueParse;
|
import com.walker.wxtools.AccountArticle;
|
import org.slf4j.Logger;
|
import org.slf4j.LoggerFactory;
|
import org.springframework.web.client.RestTemplate;
|
|
import java.util.ArrayList;
|
import java.util.Date;
|
import java.util.List;
|
import java.util.Random;
|
import java.util.concurrent.TimeUnit;
|
|
/**
|
* 模拟采集定时任务,不会终止。
|
* <p>通过直接设置登录参数,可以实现持续不断采集。</p>
|
* <p>该对象是一个模拟程序,但也可以直接使用,它从给定的参数,获取公众号文章列表集合。<br>
|
* 具体用法,请见对象:{@linkplain AccountArticleSpider}
|
* </p>
|
* @author 时克英
|
* @date 2023-04-27
|
*/
|
public class MockSpiderScheduler {
|
|
protected final transient Logger logger = LoggerFactory.getLogger(getClass());
|
|
private int responseErrorCount = 0;
|
|
private static final long sleepSeconds = 60 * 2;
|
private static final long stopSeconds = 600 * 2;
|
|
// 线程间隔时间
|
private long sleepTime = sleepSeconds;
|
// 当前采集页数
|
private int currentPage = 0;
|
|
// 采集几页后,复位重新采集(最新的)
|
private int resetPageSize = 5;
|
|
public MockSpiderScheduler(){
|
wantedKeys.add(KEY_COOKIE);
|
wantedKeys.add(KEY_REFERER);
|
wantedKeys.add(KEY_FAKEID);
|
wantedKeys.add(KEY_TOKEN);
|
this.restTemplate = new RestTemplate();
|
}
|
|
/**
|
* 仅内部测试
|
*/
|
@Deprecated
|
private void scheduleRequest(){
|
try {
|
this.spider.request();
|
} catch (RequestException e) {
|
if(e.isResponseError()){
|
if(this.responseErrorCount >= 10){
|
this.responseErrorCount = 0;
|
}
|
logger.error("请求响应错误,很可能是cookie过期,提醒用户重新获取登录信息。msg=" + e.getMessage(), e);
|
this.responseErrorCount ++;
|
if(this.responseErrorCount > 2){
|
this.cookieExpired();
|
}
|
} else {
|
logger.error("远程调用其它错误,很可能是网络问题,需要尝试重试,msg=" + e.getMessage(), e);
|
}
|
}
|
}
|
|
/**
|
* 通知用户登录过期,需要重新设置cookie
|
*/
|
protected void cookieExpired(){
|
logger.info("通知用户重新登录,并提交cookie信息");
|
}
|
|
/**
|
* 业务保存采集的数据
|
* @param list
|
*/
|
protected void saveData(List<AccountArticle> list){
|
if(!StringUtils.isEmptyList(list)){
|
logger.info("采集到数量:" + list.size());
|
logger.info("数据库保存开始...");
|
for(AccountArticle article: list){
|
logger.info(article.toString());
|
}
|
} else {
|
logger.warn("采集成功一次,但空数据");
|
}
|
}
|
|
/**
|
* 设置登录参数
|
* @param content 从浏览器中粘贴的:cookie和parameter
|
* @param delimiter 每一行分隔符,默认为'\n'
|
*/
|
public void setCookieAndParameter(String content, String delimiter){
|
if(StringUtils.isEmpty(delimiter)){
|
delimiter = "\n";
|
}
|
this.keyValueParse = new KeyValueParse();
|
this.keyValueParse.setDelimiter(delimiter);
|
List<KeyValue<String, String>> list = this.keyValueParse.parse(content, this.wantedKeys);
|
if(list == null || list.size() == 0){
|
throw new IllegalArgumentException("未解析出来任何请求参数! content=" + content);
|
}
|
|
String fakeId = null;
|
String token = null;
|
int count = 5;
|
String cookie = null;
|
String referer = null;
|
|
for(KeyValue<String, String> kv : list){
|
if(kv.getKey().equals(KEY_COOKIE)){
|
cookie = kv.getValue();
|
} else if(kv.getKey().equals(KEY_FAKEID)){
|
fakeId = kv.getValue();
|
} else if (kv.getKey().equals(KEY_REFERER)){
|
referer = kv.getValue();
|
} else if (kv.getKey().equals(KEY_TOKEN)) {
|
token = kv.getValue();
|
}
|
}
|
if(StringUtils.isEmpty(fakeId) || StringUtils.isEmpty(token)
|
|| StringUtils.isEmpty(cookie) || StringUtils.isEmpty(referer)){
|
throw new IllegalArgumentException("解析返回参数不足,无法完成调用。");
|
}
|
|
this.spider = new AccountArticleSpider();
|
this.spider.setRestTemplate(this.restTemplate);
|
this.spider.setFakeId(fakeId);
|
this.spider.setToken(token);
|
this.spider.setCount(count);
|
this.spider.setBegin(this.currentPage);
|
this.spider.setCookie(cookie);
|
this.spider.setReferer(referer);
|
|
// this.scheduleRequest();
|
if(this.thread == null){
|
logger.info("创建线程...");
|
this.thread = new Thread(new GatherTask());
|
this.thread.start();
|
}
|
}
|
|
private class GatherTask implements Runnable{
|
|
private Random random = new Random(100);
|
|
@Override
|
public void run() {
|
logger.info("............线程启动:" + new Date());
|
while (!stop){
|
try {
|
List<AccountArticle> list = spider.request();
|
saveData(list);
|
sleepTime = sleepSeconds;
|
logger.info("采集第 {} 页完成", currentPage);
|
currentPage ++;
|
|
if(currentPage >= resetPageSize){
|
logger.debug("达到触发页数,复位重新开始采集(最新的)");
|
currentPage = 0;
|
}
|
|
// 重新设置采集页码
|
spider.setBegin(currentPage);
|
|
} catch (RequestException e) {
|
if(e.isResponseError()){
|
if(responseErrorCount >= 10){
|
responseErrorCount = 0;
|
}
|
logger.error("请求响应错误,很可能是cookie过期,提醒用户重新获取登录信息。msg=" + e.getMessage(), e);
|
responseErrorCount ++;
|
if(responseErrorCount > 2){
|
// 需要重新设置登录信息,强制休眠更长时间。
|
cookieExpired();
|
sleepTime = stopSeconds;
|
}
|
} else {
|
logger.error("远程调用其它错误,很可能是网络问题,需要尝试重试,msg=" + e.getMessage(), e);
|
}
|
} catch (Exception ex){
|
logger.error("其他异常,可能数据保存错误:" + ex.getMessage(), ex);
|
|
} finally {
|
int rnd = this.random.nextInt(10);
|
if(rnd == 0){
|
rnd = 2;
|
}
|
try {
|
logger.debug("线程休眠 {} 秒", sleepTime * rnd);
|
TimeUnit.SECONDS.sleep(sleepTime * rnd);
|
} catch (InterruptedException e) {}
|
}
|
}
|
logger.warn("采集线程终止!");
|
}
|
}
|
|
/**
|
* 终止程序执行。
|
*/
|
public void terminate(){
|
this.stop = true;
|
}
|
|
private Thread thread = null;
|
private boolean stop = false;
|
|
private RestTemplate restTemplate = null;
|
private AccountArticleSpider spider = null;
|
private KeyValueParse keyValueParse;
|
|
private List<String> wantedKeys = new ArrayList<>(8);
|
|
public static final String KEY_COOKIE = "cookie";
|
public static final String KEY_REFERER = "referer";
|
public static final String KEY_FAKEID = "fakeid";
|
public static final String KEY_TOKEN = "token";
|
}
|