Java 开源爬虫框架 WebCollector 教程——爬取微信公众号

by briefcopy · 2017年11月27日

本教程根据指定的微信公众号名称，从搜狗微信搜索中爬取微信公众号发布的最新文章。由于微信公众号文章页的URL为临时URL，该爬虫程序基于WebCollector 2.52的新特性NextFilter设计了新的去重方式。

运行结果

日志：
TIM截图20170604184710

历史记录：
TIM截图20170604184843


package com.datahref;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.nextfilter.HashSetNextFilter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * 本教程为微信公众号文章抓取示例 示例中抓取的公众号标题（用于去重）存放于文件中，仅供参考 线上系统请老老实实用数据库
 *
 * @author hu
 */
public class WxAccountCrawler extends BreadthCrawler {

    public static final Logger LOG = LoggerFactory.getLogger(WxAccountCrawler.class);

    protected String historyKeysPath;
    protected BufferedWriter historyKeysWriter;

    public WxAccountCrawler(String crawlPath, String historyKeysPath) throws Exception {
        super(crawlPath, false);
        this.historyKeysPath = historyKeysPath;
        LOG.info("initializing history-keys-filter ......");
        this.setNextFilter(new HistoryKeysFilter(historyKeysPath));
        LOG.info("creating history-keys-writer");
        historyKeysWriter = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(historyKeysPath, true), "utf-8"));

    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        String account = page.meta("account");

        if (page.matchType("account_search")) {
            //对于账号搜索页面

            //抽取公众号文章列表页URL
            Element accountLinkEle = page.select("p.tit>a").first();
            //防止搜索结果为空
            if (accountLinkEle == null) {
                LOG.info("公众号\"" + account + "\"不存在，请给出准确的公众号名");
                return;
            }
            //防止公众号名错误
            String detectedAccount = accountLinkEle.text().trim();
            if (!account.equals(detectedAccount)) {
                LOG.info("公众号\"" + account + "\"与搜索结果\"" + detectedAccount + "\"名称不符，请给出准确的公众号名");
                return;
            }
            String accountUrl = accountLinkEle.attr("abs:href");
            next.add(new CrawlDatum(accountUrl, "article_list").meta("account", account));

        } else if (page.matchType("article_list")) {
            //对于公众号文章列表页

            String prefix = "msgList = ";
            String suffix = "seajs.use";
            int startIndex = page.html().indexOf(prefix) + prefix.length();
            int endIndex = page.html().indexOf(suffix);
            String jsonStr = page.html().substring(startIndex, endIndex);
            JSONObject json = new JSONObject(jsonStr);
            JSONArray articleJSONArray = json.getJSONArray("list");
            for (int i = 0; i < articleJSONArray.length(); i++) {
                JSONObject articleJSON = articleJSONArray.getJSONObject(i).getJSONObject("app_msg_ext_info");
                String title = articleJSON.getString("title").trim();
                String key = account + "_" + title;
                String articleUrl = "http://mp.weixin.qq.com" + articleJSON.getString("content_url").replace("&", "&");
                next.add(new CrawlDatum(articleUrl, "article").key(key).meta("account", account));
            }

        } else if (page.matchType("article")) {
            //对于文章页
            //抽取标题、内容等信息，此处仅print少数信息作为参考
            String title = page.select("h2.rich_media_title").first().text().trim();
            String date = page.select("em#post-date").first().text().trim();
            String content = page.select("div.rich_media_content").first().text().trim();

            try {
                writeHistoryKey(page.key());
                JSONObject articleJSON = new JSONObject();
                articleJSON.append("account", account)
                        .append("title", title)
                        .append("date", date)
                        .append("content", content);
                System.out.println(articleJSON);
            } catch (IOException ex) {
                LOG.info("writer exception", ex);
            }

        }
    }

    public synchronized void writeHistoryKey(String key) throws IOException {
        historyKeysWriter.write(key + "\n");
    }

    @Override
    public void start(int depth) throws Exception {
        super.start(depth);
        //关闭文件，保存history keys
        historyKeysWriter.close();
        LOG.info("save history keys");
    }

    public void addAccount(String account) throws UnsupportedEncodingException {
        String seedUrl = "http://weixin.sogou.com/weixin?type=1&"
                + "s_from=input&ie=utf8&query=" + URLEncoder.encode(account, "utf-8");
        CrawlDatum seed = new CrawlDatum(seedUrl, "account_search").meta("account", account);
        addSeed(seed);
    }

    //该示例读取文件中的key进行文章去重
    //线上应用请老老实实用数据库
    public class HistoryKeysFilter extends HashSetNextFilter {

        //读取历史文章标题，用于去重
        public HistoryKeysFilter(String historyKeysPath) throws Exception {
            File historyFile = new File(historyKeysPath);
            if (historyFile.exists()) {
                FileInputStream fis = new FileInputStream(historyKeysPath);
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    this.add(line);
                }
                reader.close();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        WxAccountCrawler crawler = new WxAccountCrawler("crawl_weixin", "wx_history.txt");
        crawler.addAccount("机器之心");
        crawler.addAccount("ZEALER订阅号");
        crawler.setThreads(5);
        crawler.start(10);
    }

}

You may also like...

发表回复取消回复