package com.walker.wxtools.spider;

import com.walker.infrastructure.utils.KeyValue;
import com.walker.infrastructure.utils.StringUtils;
import com.walker.spider.KeyValueParse;
import com.walker.wxtools.AccountArticle;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.client.RestTemplate;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * Mock collection scheduler; it keeps running until {@link #terminate()} is called.
 *
 * <p>By setting the login parameters directly, articles can be collected continuously.</p>
 *
 * <p>This object is a mock program, but it can also be used as is: from the given
 * parameters it fetches the list of official-account articles.
 * For concrete usage, see {@linkplain AccountArticleSpider}.</p>
 *
 * <p>Threading: a single background thread is created on the first successful call to
 * {@link #setCookieAndParameter(String, String)}. Once {@link #terminate()} has been
 * called the scheduler is not restarted by later calls.</p>
 *
 * @author 时克英
 * @date 2023-04-27
 */
public class MockSpiderScheduler {

    public static final String KEY_COOKIE = "cookie";
    public static final String KEY_REFERER = "referer";
    public static final String KEY_FAKEID = "fakeid";
    public static final String KEY_TOKEN = "token";

    /** Base pause between two requests, in seconds (multiplied by a random factor). */
    private static final long sleepSeconds = 60 * 2;

    /** Base pause used once the cookie is considered expired, in seconds. */
    private static final long stopSeconds = 600 * 2;

    protected final transient Logger logger = LoggerFactory.getLogger(getClass());

    /** Number of response errors seen; wraps back to 0 once it has reached 10. */
    private int responseErrorCount = 0;

    /** Current base pause of the worker thread, in seconds. */
    private long sleepTime = sleepSeconds;

    /** Page currently being collected. */
    private int currentPage = 0;

    /** After this many pages, collection restarts from page 0 (the newest articles). */
    private int resetPageSize = 5;

    // The three fields below are written by the caller thread and read by the worker
    // thread (or vice versa), hence volatile for visibility.
    private volatile Thread thread = null;
    private volatile boolean stop = false;
    private volatile AccountArticleSpider spider = null;

    private RestTemplate restTemplate = null;
    private KeyValueParse keyValueParse;

    /** Keys that must be extracted from the pasted browser content. */
    private List<String> wantedKeys = new ArrayList<>(8);

    /**
     * Creates the scheduler, registers the required parameter keys and prepares
     * the {@link RestTemplate} used for all requests.
     */
    public MockSpiderScheduler() {
        wantedKeys.add(KEY_COOKIE);
        wantedKeys.add(KEY_REFERER);
        wantedKeys.add(KEY_FAKEID);
        wantedKeys.add(KEY_TOKEN);
        this.restTemplate = new RestTemplate();
    }

    /**
     * Issues a single request; internal testing only.
     */
    @Deprecated
    private void scheduleRequest() {
        try {
            this.spider.request();
        } catch (RequestException e) {
            if (e.isResponseError()) {
                this.recordResponseError(e);
            } else {
                logger.error("远程调用其它错误,很可能是网络问题,需要尝试重试,msg={}", e.getMessage(), e);
            }
        }
    }

    /**
     * Records one response error and notifies the user through {@link #cookieExpired()}
     * once more than two errors have accumulated.
     *
     * @param e the response error that was raised by the spider
     * @return {@code true} if {@link #cookieExpired()} was invoked for this error
     */
    private boolean recordResponseError(RequestException e) {
        if (this.responseErrorCount >= 10) {
            this.responseErrorCount = 0;
        }
        logger.error("请求响应错误,很可能是cookie过期,提醒用户重新获取登录信息。msg={}", e.getMessage(), e);
        this.responseErrorCount++;
        if (this.responseErrorCount > 2) {
            this.cookieExpired();
            return true;
        }
        return false;
    }

    /**
     * Notifies the user that the login has expired and the cookie must be set again.
     * The default implementation only writes a log entry.
     */
    protected void cookieExpired() {
        logger.info("通知用户重新登录,并提交cookie信息");
    }

    /**
     * Business hook that stores the collected data.
     * The default implementation only logs every article.
     *
     * @param list articles returned by one request; may be empty
     */
    protected void saveData(List<AccountArticle> list) {
        if (StringUtils.isEmptyList(list)) {
            logger.warn("采集成功一次,但空数据");
            return;
        }
        logger.info("采集到数量:{}", list.size());
        logger.info("数据库保存开始...");
        for (AccountArticle article : list) {
            logger.info(article.toString());
        }
    }

    /**
     * Sets the login parameters and, on the first call, starts the collection thread.
     *
     * @param content   text pasted from the browser: cookie and request parameters
     * @param delimiter separator between lines, defaults to {@code '\n'} when empty
     * @throws IllegalArgumentException if nothing could be parsed, or if any of
     *                                  cookie, referer, fakeid or token is missing
     */
    public void setCookieAndParameter(String content, String delimiter) {
        if (StringUtils.isEmpty(delimiter)) {
            delimiter = "\n";
        }
        this.keyValueParse = new KeyValueParse();
        this.keyValueParse.setDelimiter(delimiter);
        List<KeyValue<String, String>> list = this.keyValueParse.parse(content, this.wantedKeys);
        if (list == null || list.isEmpty()) {
            throw new IllegalArgumentException("未解析出来任何请求参数! content=" + content);
        }

        String fakeId = null;
        String token = null;
        String cookie = null;
        String referer = null;
        for (KeyValue<String, String> kv : list) {
            if (kv.getKey().equals(KEY_COOKIE)) {
                cookie = kv.getValue();
            } else if (kv.getKey().equals(KEY_FAKEID)) {
                fakeId = kv.getValue();
            } else if (kv.getKey().equals(KEY_REFERER)) {
                referer = kv.getValue();
            } else if (kv.getKey().equals(KEY_TOKEN)) {
                token = kv.getValue();
            }
        }
        if (StringUtils.isEmpty(fakeId) || StringUtils.isEmpty(token)
                || StringUtils.isEmpty(cookie) || StringUtils.isEmpty(referer)) {
            throw new IllegalArgumentException("解析返回参数不足,无法完成调用。");
        }

        // Configure the new spider completely before publishing it to the worker thread.
        int count = 5;
        AccountArticleSpider configured = new AccountArticleSpider();
        configured.setRestTemplate(this.restTemplate);
        configured.setFakeId(fakeId);
        configured.setToken(token);
        configured.setCount(count);
        configured.setBegin(this.currentPage);
        configured.setCookie(cookie);
        configured.setReferer(referer);
        this.spider = configured;

        if (this.thread == null) {
            logger.info("创建线程...");
            this.thread = new Thread(new GatherTask());
            this.thread.start();
        }
    }

    /**
     * Stops the collection. The worker thread is interrupted so that a long pause
     * ends immediately instead of running to completion.
     */
    public void terminate() {
        this.stop = true;
        Thread worker = this.thread;
        // Do not interrupt ourselves when called from a hook running on the worker.
        if (worker != null && worker != Thread.currentThread()) {
            worker.interrupt();
        }
    }

    /**
     * Worker loop: request one page, hand the result to {@link #saveData(List)},
     * advance the page counter, then pause for a randomised time.
     */
    private class GatherTask implements Runnable {

        // Fixed seed: the sequence of pause multipliers is the same on every run.
        private final Random random = new Random(100);

        @Override
        public void run() {
            logger.info("............线程启动:{}", new Date());
            while (!stop && !Thread.currentThread().isInterrupted()) {
                try {
                    List<AccountArticle> list = spider.request();
                    saveData(list);
                    sleepTime = sleepSeconds;
                    logger.info("采集第 {} 页完成", currentPage);
                    currentPage++;
                    if (currentPage >= resetPageSize) {
                        logger.debug("达到触发页数,复位重新开始采集(最新的)");
                        currentPage = 0;
                    }
                    // Point the spider at the next page to collect.
                    spider.setBegin(currentPage);
                } catch (RequestException e) {
                    if (e.isResponseError()) {
                        if (recordResponseError(e)) {
                            // Login must be renewed; force a much longer pause.
                            sleepTime = stopSeconds;
                        }
                    } else {
                        logger.error("远程调用其它错误,很可能是网络问题,需要尝试重试,msg={}", e.getMessage(), e);
                    }
                } catch (Exception ex) {
                    logger.error("其他异常,可能数据保存错误:{}", ex.getMessage(), ex);
                } finally {
                    if (!stop) {
                        pause();
                    }
                }
            }
            logger.warn("采集线程终止!");
        }

        /**
         * Sleeps for {@code sleepTime} multiplied by a random factor between 1 and 9.
         * An interrupt is treated as a stop request: the flag is restored so that the
         * loop condition ends the worker.
         */
        private void pause() {
            int rnd = this.random.nextInt(10);
            if (rnd == 0) {
                rnd = 2;
            }
            long seconds = sleepTime * rnd;
            try {
                logger.debug("线程休眠 {} 秒", seconds);
                TimeUnit.SECONDS.sleep(seconds);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
}