Java开源爬虫框架WebCollector教程——爬取新浪微博
本教程给出了一个使用WebCollector模拟登陆并爬取新浪微博的示例,完整的新浪微博抓取示例工程可加群250108697或345054141在群文件中获取。
1.依赖jar包:
本教程需要两套jar包,WebCollector核心jar包和selenium的jar包。
WebCollector最新jar包可在WebCollector github主页下载。
selenium的maven依赖:
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.4.0</version>
</dependency>
2.代码:
- 利用Selenium获取登陆新浪微博weibo.cn的cookie(WeiboCN.java)
- 利用WebCollector和获取的cookie爬取新浪微博并抽取数据(WeiboCrawler.java)
WeiboCN.java
import java.util.Set;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import com.gargoylesoftware.htmlunit.BrowserVersion;
/**
* 该登录算法适用时间: 2017-6-2 —— ?
* 利用Selenium获取登陆新浪微博weibo.cn的cookie
*
* @author hu
*/
public class WeiboCN {
/**
* 获取新浪微博的cookie,这个方法针对weibo.cn有效,对weibo.com无效 weibo.cn以明文形式传输数据,请使用小号
*
* @param username 新浪微博用户名
* @param password 新浪微博密码
* @return
* @throws Exception
*/
public static String getSinaCookie(String username, String password) throws Exception {
StringBuilder sb = new StringBuilder();
HtmlUnitDriver driver = new HtmlUnitDriver(BrowserVersion.INTERNET_EXPLORER);
driver.setJavascriptEnabled(true);
driver.get("https://passport.weibo.cn/signin/login");
driver.executeScript("document.getElementById('loginWrapper').style.display = 'block'");
WebElement mobile = driver.findElementByCssSelector("input#loginName");
mobile.sendKeys(username);
WebElement pass = driver.findElementByCssSelector("input#loginPassword");
pass.sendKeys(password);
WebElement submit = driver.findElementByCssSelector("a#loginAction");
submit.click();
String result = concatCookie(driver);
System.out.println("Get Cookie: " + result);
driver.close();
if (result.contains("SUB")) {
return result;
} else {
throw new Exception("weibo login failed");
}
}
public static String concatCookie(HtmlUnitDriver driver) {
Set<Cookie> cookieSet = driver.manage().getCookies();
StringBuilder sb = new StringBuilder();
for (Cookie cookie : cookieSet) {
sb.append(cookie.getName() + "=" + cookie.getValue() + ";");
}
String result = sb.toString();
return result;
}
}
WeiboCrawler.java
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 该登录算法适用时间: 2017-6-2 —— ?
* 利用WebCollector和获取的cookie爬取新浪微博并抽取数据
*
* @author hu
*/
public class WeiboCrawler extends BreadthCrawler {
String cookie;
public WeiboCrawler(String crawlPath) throws Exception {
super(crawlPath, false);
/* 获取新浪微博的cookie,账号密码以明文形式传输,请使用小号 */
cookie = WeiboCN.getSinaCookie("你的用户名", "你的密码");
//设置线程数
setThreads(3);
//设置每个线程的爬取间隔
getConf().setExecuteInterval(1000);
}
@Override
public Page getResponse(CrawlDatum crawlDatum) throws Exception {
HttpRequest request = new HttpRequest(crawlDatum);
request.setCookie(cookie);
return request.responsePage();
}
@Override
public void visit(Page page, CrawlDatums next) {
int pageNum = Integer.valueOf(page.meta("pageNum"));
/* 抽取微博 */
Elements weibos = page.select("div.c[id]");
for (Element weibo : weibos) {
System.out.println("第" + pageNum + "页\t" + weibo.text());
}
}
public static void main(String[] args) throws Exception {
WeiboCrawler crawler = new WeiboCrawler("crawl_weibo");
/* 对某人微博前5页进行爬取 */
for (int i = 1; i <= 5; i++) {
String seedUrl = "http://weibo.cn/zhouhongyi?vt=4&page=" + i;
crawler.addSeedAndReturn(seedUrl).meta("pageNum", i);
}
crawler.start(1);
}
}