Java Open-Source Crawler Framework WebCollector Tutorial: Crawling WeChat Public Accounts

This tutorial takes a given WeChat public account name and crawls the account's latest published articles through Sogou WeChat Search. Because the URL of a WeChat article page is temporary, the crawler relies on NextFilter, a feature introduced in WebCollector 2.52, to implement a new deduplication scheme: articles are deduplicated by a stable key (account name + article title) instead of by URL.
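Before diving into the full program, the deduplication idea is worth isolating. WebCollector lets a CrawlDatum carry an explicit key, and the NextFilter consults a set of already-seen keys rather than already-seen URLs. The sketch below illustrates just that idea in plain Java, independent of WebCollector; the class and method names (KeyDedupSketch, shouldCrawl) are made up for illustration:

    import java.util.HashSet;
    import java.util.Set;

    // Plain-Java sketch of key-based dedup: WeChat article URLs are temporary,
    // so "accountName_articleTitle" serves as the stable identity of an article.
    public class KeyDedupSketch {

        private final Set<String> seenKeys = new HashSet<>();

        // Returns true if the article is new and should be crawled.
        public boolean shouldCrawl(String account, String title) {
            String key = account + "_" + title.trim();
            // Set.add() returns false when the key is already present
            return seenKeys.add(key);
        }

        public static void main(String[] args) {
            KeyDedupSketch filter = new KeyDedupSketch();
            System.out.println(filter.shouldCrawl("机器之心", "Some article")); // true: first sighting
            System.out.println(filter.shouldCrawl("机器之心", "Some article")); // false: duplicate key
        }
    }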

Run Results

Log: (screenshot omitted)

History records: (screenshot omitted)


package com.datahref;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.nextfilter.HashSetNextFilter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An example of crawling WeChat public-account articles.
 * The crawled article titles (used for deduplication) are stored in a plain
 * file for reference only; a production system should use a database.
 *
 * @author hu
 */
public class WxAccountCrawler extends BreadthCrawler {

    public static final Logger LOG = LoggerFactory.getLogger(WxAccountCrawler.class);

    protected String historyKeysPath;
    protected BufferedWriter historyKeysWriter;

    public WxAccountCrawler(String crawlPath, String historyKeysPath) throws Exception {
        super(crawlPath, false);
        this.historyKeysPath = historyKeysPath;
        LOG.info("initializing history-keys-filter ......");
        this.setNextFilter(new HistoryKeysFilter(historyKeysPath));
        LOG.info("creating history-keys-writer");
        historyKeysWriter = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(historyKeysPath, true), "utf-8"));
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        String account = page.meta("account");
        if (page.matchType("account_search")) {
            // Account search page: extract the URL of the account's article-list page
            Element accountLinkEle = page.select("p.tit>a").first();
            // Guard against an empty search result
            if (accountLinkEle == null) {
                LOG.info("public account \"" + account + "\" not found, please provide the exact account name");
                return;
            }
            // Guard against a mistyped account name
            String detectedAccount = accountLinkEle.text().trim();
            if (!account.equals(detectedAccount)) {
                LOG.info("public account \"" + account + "\" does not match search result \""
                        + detectedAccount + "\", please provide the exact account name");
                return;
            }
            String accountUrl = accountLinkEle.attr("abs:href");
            next.add(new CrawlDatum(accountUrl, "article_list").meta("account", account));

        } else if (page.matchType("article_list")) {
            // Article-list page: the data sits in a JS variable "msgList",
            // so slice the raw HTML between the two markers and parse it as JSON
            String prefix = "msgList = ";
            String suffix = "seajs.use";
            int startIndex = page.html().indexOf(prefix) + prefix.length();
            int endIndex = page.html().indexOf(suffix);
            String jsonStr = page.html().substring(startIndex, endIndex);

            JSONObject json = new JSONObject(jsonStr);
            JSONArray articleJSONArray = json.getJSONArray("list");
            for (int i = 0; i < articleJSONArray.length(); i++) {
                JSONObject articleJSON = articleJSONArray.getJSONObject(i).getJSONObject("app_msg_ext_info");
                String title = articleJSON.getString("title").trim();
                String key = account + "_" + title;
                // content_url is HTML-escaped and relative; unescape and absolutize it
                String articleUrl = "http://mp.weixin.qq.com"
                        + articleJSON.getString("content_url").replace("&amp;", "&");
                next.add(new CrawlDatum(articleUrl, "article").key(key).meta("account", account));
            }

        } else if (page.matchType("article")) {
            // Article page: extract title, date, content, etc.;
            // only a few fields are printed here for reference
            String title = page.select("h2.rich_media_title").first().text().trim();
            String date = page.select("em#post-date").first().text().trim();
            String content = page.select("div.rich_media_content").first().text().trim();
            try {
                writeHistoryKey(page.key());
                JSONObject articleJSON = new JSONObject();
                articleJSON.put("account", account)
                        .put("title", title)
                        .put("date", date)
                        .put("content", content);
                System.out.println(articleJSON);
            } catch (IOException ex) {
                LOG.info("writer exception", ex);
            }
        }
    }

    public synchronized void writeHistoryKey(String key) throws IOException {
        historyKeysWriter.write(key + "\n");
    }

    @Override
    public void start(int depth) throws Exception {
        super.start(depth);
        // Close the file to persist the history keys
        historyKeysWriter.close();
        LOG.info("save history keys");
    }

    public void addAccount(String account) throws UnsupportedEncodingException {
        String seedUrl = "http://weixin.sogou.com/weixin?type=1&"
                + "s_from=input&ie=utf8&query=" + URLEncoder.encode(account, "utf-8");
        CrawlDatum seed = new CrawlDatum(seedUrl, "account_search").meta("account", account);
        addSeed(seed);
    }

    /**
     * This example reads keys from a file to deduplicate articles.
     * A production application should use a database instead.
     */
    public class HistoryKeysFilter extends HashSetNextFilter {

        // Read the titles of previously crawled articles for deduplication
        public HistoryKeysFilter(String historyKeysPath) throws Exception {
            File historyFile = new File(historyKeysPath);
            if (historyFile.exists()) {
                FileInputStream fis = new FileInputStream(historyKeysPath);
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    this.add(line);
                }
                reader.close();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        WxAccountCrawler crawler = new WxAccountCrawler("crawl_weixin", "wx_history.txt");
        crawler.addAccount("机器之心");
        crawler.addAccount("ZEALER订阅号");
        crawler.setThreads(5);
        crawler.start(10);
    }
}
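The least obvious step in visit() above is parsing the article-list page: Sogou does not render the article list as plain HTML, so the crawler slices the raw page source between the "msgList = " assignment and the following "seajs.use" call, then parses the slice as JSON (org.json stops at the object's closing brace, so the trailing ";" is harmless). Below is a standalone sketch of that extraction; the embedded HTML string is a simplified, made-up stand-in for the real Sogou page:

    import org.json.JSONArray;
    import org.json.JSONObject;

    // Standalone sketch of the msgList extraction used in visit() above.
    // The HTML string is a simplified, made-up stand-in for the real Sogou page.
    public class MsgListExtractionSketch {

        public static void main(String[] args) {
            String html = "<script>var msgList = {\"list\":[{\"app_msg_ext_info\":"
                    + "{\"title\":\"Hello\",\"content_url\":\"/s?__biz=x&amp;mid=1\"}}]};"
                    + "seajs.use('sogou/profile.js');</script>";

            // Slice the raw HTML between the two markers, exactly as the crawler does
            String prefix = "msgList = ";
            String suffix = "seajs.use";
            int start = html.indexOf(prefix) + prefix.length();
            int end = html.indexOf(suffix);
            String jsonStr = html.substring(start, end);

            // org.json parses up to the matching closing brace; the trailing ";" is ignored
            JSONObject json = new JSONObject(jsonStr);
            JSONArray list = json.getJSONArray("list");
            for (int i = 0; i < list.length(); i++) {
                JSONObject info = list.getJSONObject(i).getJSONObject("app_msg_ext_info");
                // content_url is HTML-escaped and relative; unescape and absolutize it
                String url = "http://mp.weixin.qq.com"
                        + info.getString("content_url").replace("&amp;", "&");
                System.out.println(info.getString("title") + " -> " + url);
            }
        }
    }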
