crawler series 爬虫系列之 WebBrowser Crawler

crawler series 爬虫系列之 WebBrowser Crawler

HashFlare

写过web版本的小爬虫,但是遇到after login page总是蛋疼,需要登录并记下cookie,有时候遇到验证码就更郁闷,况且还有各种限制,比如discuz发帖器会有一个随机产生的formHash,每次要抓下来存起来,在webBrowser里面操作的好处显而易见,我们完全可以手动做完,直到目标区域,再开始自动化操作

如果想做一件事情千万不要去抱怨多么难,因为你想做的事情并非正常‘人’做的事情,而是想偷懒用机器代替人工,所以你首先必须要突破自己

WebBrowser 的基本用法不必多说,注意设置 ScriptErrorsSuppressed=true,并在 webBrowser_DocumentCompleted 中检查 Url 和 ReadyState;在 WinForm 应用中使用时记得把 a 标签的 target 换掉,否则会弹出新窗口(ajax 加载内容中的 a 和 iframe 中的 a 需要另外特别处理),或者改为处理 NewWindow 事件

一、基本操作:search 高亮 事件绑定触发
参考:http://blog.csdn.net/wonsoft/article/details/5196837

二、基本信息采集(分页采集)

http://www.cnblogs.com/finallyliuyu/archive/2010/11/02/1866966.html

信号变量初始赋值
/// <summary>
/// Initializes the form and puts the crawler's signalling flags into their idle state.
/// </summary>
public Form1()
{
    InitializeComponent();

    // No workflow is running and the crawler is not in "next page" mode yet.
    mysignal1 = false;
    mysignal2 = false;

    // Both wait flags start as "pending"; webBrowser1_DocumentCompleted clears them.
    loading = true;
    subloading = true;

    // btnworkflow_Click enumerates issuesMap as strings, so the list holds the seed URLs.
    // The generic argument was missing here, which left an uncompilable `new List()`.
    issuesMap = new List<string>();
}
工作流按钮代码,点击此按钮,则爬虫自动工作
// Workflow button handler: for every seed URL in issuesMap it loads the directory page,
// stores that page's article list, then keeps clicking "next page" (AnchorNextPage) and
// storing each further page's list until no next-page link is found.
// Each navigation is awaited by pumping messages with Application.DoEvents() until
// webBrowser1_DocumentCompleted clears the matching flag: `loading` for the first page,
// `subloading` for pages reached through the next-page link.
private void btnworkflow_Click(object sender, EventArgs e)
{
// Mark the workflow as running; DocumentCompleted ignores completions until this is set.
mysignal1 = true;
// NOTE(review): the generic argument of List appears to be missing in this listing - confirm the element type.
List arListCurrentPage;
foreach (string s in issuesMap)
{
// Arm the wait flag, start loading the directory page, then spin until it completes.
loading = true;
string tmpurl = s;
webBrowser1.Navigate(tmpurl);
while (loading == true)
{
Application.DoEvents();
}
// Collect and persist the article entries of the first directory page.
arListCurrentPage = GetArticlePageInfoFromCurrentDirpage();
if (arListCurrentPage != null)
{
InsertTitleUrlToDataBase(arListCurrentPage);
}
// From here on DocumentCompleted clears `subloading` instead of `loading`.
mysignal2 = true;
// AnchorNextPage() clicks the next-page link and returns false when there is none.
while (AnchorNextPage())
{
subloading = true;
while(subloading)
{
Application.DoEvents();

}
arListCurrentPage = GetArticlePageInfoFromCurrentDirpage();
if (arListCurrentPage != null)
{
InsertTitleUrlToDataBase(arListCurrentPage);
}
}
// Leave next-page mode before moving on to the next seed URL.
mysignal2 = false;

// Get the next-page link of the current page
}
}

webbrowserCompleted更新信号
/// <summary>
/// Clears the wait flag that btnworkflow_Click is currently spinning on once the
/// browser reports that the document is complete.
/// </summary>
/// <param name="sender">The WebBrowser raising the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    if (webBrowser1.ReadyState == WebBrowserReadyState.Complete)
    {
        // Only signal while the workflow started by the button is running.
        if (mysignal1)
        {
            if (!mysignal2)
            {
                // First directory page of a seed URL.
                loading = false;
            }
            else
            {
                // A page reached through the "next page" link.
                subloading = false;
            }
        }
    }
} // This closing brace of the method was missing in the original listing.
点击当前页下一页的函数
/// <summary>
/// Clicks the "next page" link of the page currently shown in the browser.
/// </summary>
/// <returns>true if a next-page link was found and clicked; otherwise false.</returns>
private bool AnchorNextPage()
{
    bool rstStatus = false;

    // ... elided in the original article: code that uses regular expressions and DOM
    // functions (GetElementsByTagName, GetElementById, etc.) to locate the next-page
    // link of the current page; presumably it leaves the result in htmlElemNext,
    // which is read below ...

    if (htmlElemNext != null)
    {
        mshtml.IHTMLElement anchor = (mshtml.IHTMLElement)htmlElemNext.DomElement;
        anchor.click(); // simulate a click
        rstStatus = true;
    }

    return rstStatus;
}
-----------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------
改进版
文章实体
/// <summary>
/// Article entity: the title, URL and raw page text of one crawled article.
/// All three fields start out as the empty string.
/// </summary>
public class ArticlePage
{
    /// <summary>Article title.</summary>
    public string title = string.Empty;

    /// <summary>Article URL.</summary>
    public string url = string.Empty;

    /// <summary>Raw text captured for the article.</summary>
    public string rawtext = string.Empty;
}
// Signalling flags shared by btnworkflow_Click and webBrowser1_DocumentCompleted.
public bool mysignal1;// whether the btnworkflow button has been clicked
// Set to true by btnworkflow_Click while it pages through "next page" links.
public bool mysignal2;
// Set to true by btnworkflow_Click while it visits the individual article pages.
public bool mysignal3;
public bool loading;// flag through which the workflow button and the WebBrowser communicate
// Wait flag cleared by DocumentCompleted when mysignal2 is set and mysignal3 is not.
public bool subloading;
// Wait flag cleared by DocumentCompleted when mysignal3 is set.
public bool subloadingPer;
信号变量初始化
/// <summary>
/// Initializes the form: all mode signals start cleared and all wait flags start pending.
/// </summary>
public Form1()
{
    InitializeComponent();

    // Mode signals: nothing is running yet.
    mysignal1 = mysignal2 = mysignal3 = false;

    // Wait flags: considered "still loading" until DocumentCompleted clears them.
    loading = subloading = subloadingPer = true;
}
}
webbrowserCompleted更新信号
/// <summary>
/// Clears exactly one wait flag when a document finishes loading while the workflow is
/// running: subloadingPer in article mode (mysignal3), otherwise subloading in next-page
/// mode (mysignal2), otherwise loading.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Nothing to signal unless the document is complete and the workflow has been started.
    if (webBrowser1.ReadyState != WebBrowserReadyState.Complete || !mysignal1)
    {
        return;
    }

    if (mysignal3)
    {
        // An individual article page finished loading.
        subloadingPer = false;
    }
    else if (mysignal2)
    {
        // A directory page reached in next-page mode finished loading.
        subloading = false;
    }
    else
    {
        // The first directory page of a seed URL finished loading.
        loading = false;
    }
}
实现自动爬虫的工作流
/// <summary>
/// Runs the automatic crawl. For every seed URL in issuesMap it loads the directory page,
/// visits every article listed on it to capture the page text, stores the results, restores
/// the directory page, and then repeats the same for every page reached through the
/// "next page" link until AnchorNextPage() reports that there is none.
/// </summary>
/// <remarks>
/// Navigation is awaited by pumping messages with Application.DoEvents() until
/// webBrowser1_DocumentCompleted clears the flag being waited on.
/// </remarks>
private void btnworkflow_Click(object sender, EventArgs e)
{
    mysignal1 = true;

    foreach (string s in issuesMap)
    {
        // Arm the wait flag before navigating so it is already pending when the
        // completion event arrives.
        loading = true;
        webBrowser1.Navigate(s);
        while (loading)
        {
            Application.DoEvents();
        }

        List<ArticlePage> arListCurrentPage = GetArticlePageInfoFromCurrentDirpage();
        if (arListCurrentPage != null)
        {
            string firstDirPageUrl = CrawlArticlesOfCurrentDirPage(arListCurrentPage);

            // Restore the directory page view; mysignal2 is still false, so the
            // completion handler clears `loading`.
            loading = true;
            webBrowser1.Navigate(firstDirPageUrl);
            while (loading)
            {
                Application.DoEvents();
            }
        }

        // Next-page mode: completions now clear `subloading`.
        mysignal2 = true;
        while (AnchorNextPage())
        {
            subloading = true;
            while (subloading)
            {
                Application.DoEvents();
            }

            arListCurrentPage = GetArticlePageInfoFromCurrentDirpage();
            if (arListCurrentPage != null)
            {
                string nextDirPageUrl = CrawlArticlesOfCurrentDirPage(arListCurrentPage);

                // Restore the directory page so the next AnchorNextPage() call sees it.
                subloading = true;
                webBrowser1.Navigate(nextDirPageUrl);
                while (subloading)
                {
                    Application.DoEvents();
                }
            }
        }
        mysignal2 = false;
    }
}

/// <summary>
/// Visits every article of the directory page currently shown, stores each page's
/// DocumentText in ArticlePage.rawtext and writes the list to the database.
/// </summary>
/// <param name="articles">Articles listed on the current directory page; must not be null.</param>
/// <returns>The URL of the directory page, so the caller can navigate back to it.</returns>
private string CrawlArticlesOfCurrentDirPage(List<ArticlePage> articles)
{
    // Remember the directory page before leaving it.
    string dirPageUrl = webBrowser1.Url.ToString();

    // Article mode: completions now clear `subloadingPer`.
    mysignal3 = true;
    try
    {
        foreach (ArticlePage ap in articles)
        {
            subloadingPer = true;
            webBrowser1.Navigate(ap.url);
            while (subloadingPer)
            {
                Application.DoEvents();
            }
            ap.rawtext = webBrowser1.DocumentText;
        }

        InsertTitleUrlToDataBase(articles);
    }
    finally
    {
        // Always leave article mode, even if a page or the database insert fails.
        mysignal3 = false;
    }

    return dirPageUrl;
}
数据库操作
/// <summary>
/// Writes every article of the given list to the database through DataBaseManipulation.
/// </summary>
/// <param name="arlist">Articles to store; a null list is ignored.</param>
private void InsertTitleUrlToDataBase(List<ArticlePage> arlist)
{
    // The generic argument of List was missing in the original signature; the loop below
    // enumerates ArticlePage items.
    if (arlist == null)
    {
        return;
    }

    DataBaseManipulation dm = new DataBaseManipulation();
    // Placeholder connection string and table name from the article - replace with real values.
    string conStr = "server=(local);database=xxxxx;uid=sa;pwd=xxx";
    dm.ConstructConnection(conStr);

    foreach (ArticlePage article in arlist)
    {
        dm.InsertToDataBase(article, "xxx");
    }
}
数据库操作2
/// <summary>
/// Inserts one article (URL, title, raw page text) into the given table using a
/// parameterized INSERT statement.
/// </summary>
/// <param name="article">The article to store.</param>
/// <param name="table">
/// Target table name. It is formatted into the SQL text (table names cannot be passed
/// as parameters), so it must come from trusted code, never from user input.
/// </param>
public void InsertToDataBase(ArticlePage article, string table)
{
    // INSERT statement text
    string sqlcommand = string.Format("insert into {0}(ArticlePageUrl,ArticlePageTitle,ArticlePageSource)values(@ArticlePageUrl,@ArticlePageTitle,@ArticlePageSource)", table);

    using (SqlCommand cmd = new SqlCommand(sqlcommand, connection))
    {
        // Build the parameters and assign their values
        cmd.Parameters.Add("@ArticlePageTitle", SqlDbType.VarChar, 400).Value = article.title;
        cmd.Parameters.Add("@ArticlePageUrl", SqlDbType.VarChar, 400).Value = article.url;
        cmd.Parameters.Add("@ArticlePageSource", SqlDbType.Text).Value = article.rawtext;

        // Open the database connection
        OpenConnection();
        try
        {
            // Execute the command
            cmd.ExecuteNonQuery();
        }
        catch (System.Exception e)
        {
            // Append the error to a text file. `using` closes the writer even if the write
            // fails, and WriteLine keeps successive entries on separate lines.
            using (StreamWriter sw = new StreamWriter("D:\\myerror.txt", true, Encoding.Default))
            {
                sw.WriteLine(e.Message);
            }

            // Once an error occurs the program stops here and waits for the user to notice
            Console.Read();
        }
        finally
        {
            // Close the database connection, even if logging the error itself failed
            CloseConnection();
        }
    }
}
}
}


增强策略
解析html,可以使用HtmlAgilityPack
如果爬虫中确实需要浏览器内核,也可以内嵌 Mozilla Gecko(http://code.google.com/p/geckofx/)来实现,性能要比 IE 的好很多。
如果想要更好的性能,可以使用webkit.net(http://webkitdotnet.sourceforge.net/)。性能比gecko还要好。

三、单一ajax交互处理:

思路一:触发后timer勤奋检测:

http://bbs.csdn.net/topics/390181007

System.Timers.Timer timer = null;
/// <summary>
/// Starts loading the finance page and prepares the polling timer (not started here).
/// </summary>
private void Form1_Load(object sender, EventArgs e)
{
    webBrowser1.Navigate("http://stock.finance.sina.com.cn/hkstock/finance/01398.html");

    // One-shot 1000 ms timer (AutoReset = false); getSelectedCtrl starts it and
    // Timer_Elapsed re-arms it after every tick.
    timer = new System.Timers.Timer(1000) { AutoReset = false };
    timer.Elapsed += Timer_Elapsed;
}
/// <summary>Signature of the parameterless callback that Timer_Elapsed passes to Invoke.</summary>
public delegate void GetDataHandler();

/// <summary>
/// Timer tick: runs getData through the form's Invoke, then re-arms the one-shot timer.
/// </summary>
private void Timer_Elapsed(object sender, ElapsedEventArgs e)
{
    GetDataHandler poll = getData;
    Invoke(poll);

    // AutoReset is false, so the timer has to be restarted for the next poll.
    timer.Start();
}
/// <summary>
/// Runs the post-load step (getSelectedCtrl) once the completed document is the browser's
/// own document and the browser is in the Complete state.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Guard against DocumentCompleted being raised more than once: act only when the
    // event URL matches the browser URL and the browser state is Complete.
    bool isMainDocumentReady = e.Url == webBrowser1.Url
        && webBrowser1.ReadyState == WebBrowserReadyState.Complete;

    if (isMainDocumentReady)
    {
        // Code to execute after the page has finished loading goes here.
        getSelectedCtrl();
    }
}
/// <summary>
/// Finds the first &lt;select&gt; whose "table" attribute equals "tableGetBalanceSheet",
/// marks its option with value "zero" as selected (if present), raises the control's
/// onchange event and starts the polling timer.
/// </summary>
private void getSelectedCtrl()
{
    System.Windows.Forms.HtmlDocument page = this.webBrowser1.Document;

    foreach (System.Windows.Forms.HtmlElement dropdown in page.GetElementsByTagName("select"))
    {
        // Only the "report type" selector of the balance sheet table is of interest.
        if (!dropdown.GetAttribute("table").Equals("tableGetBalanceSheet"))
        {
            continue;
        }

        foreach (HtmlElement choice in dropdown.Children)
        {
            if (choice.GetAttribute("value") != "zero")
            {
                continue;
            }

            choice.SetAttribute("selected", "selected");
            break;
        }

        // Simulate the user changing the selection, then begin polling for the result.
        dropdown.RaiseEvent("onchange");
        timer.Start();
        return;
    }
}

思路二:在监测的元素加onpropertychange事件

http://xyz.cinc.biz/2013/12/csharp-webbrowser-html-ajax.html

test.html (HTML 頁面,每兩秒執行一次 AJAX,取得伺服器時間,更新在元素內 )











ajax.php (PHP頁面,輸出目前時間)
<?php echo date("H:i:s"); ?>
C# 程式

/// <summary>Loads the local AJAX test page into the browser control.</summary>
private void button1_Click(object sender, EventArgs e)
{
    const string testPageUrl = "http://127.0.0.1/test.html";
    webBrowser1.Navigate(testPageUrl);
}

/// <summary>
/// Handler attached to the "abc" element's onpropertychange event: prints the element's
/// current InnerHtml, or does nothing if the element is not in the document.
/// </summary>
private void handlerAbc(Object sender, EventArgs e)
{
    HtmlElement target = webBrowser1.Document.GetElementById("abc");
    if (target != null)
    {
        Console.WriteLine("elm.InnerHtml(handlerAbc):" + target.InnerHtml);
    }
}

/// <summary>
/// Demonstrates two ways of reading the page source, then hooks the "abc" element's
/// onpropertychange event so that content changed by AJAX can be observed in handlerAbc.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    /* Fetch the page source, method 1 */
    string htmlA; // htmlA only captures the page source
    using (System.IO.StreamReader getReader = new System.IO.StreamReader(webBrowser1.DocumentStream, System.Text.Encoding.Default))
    {
        htmlA = getReader.ReadToEnd();
    }

    /* Fetch the page source, method 2 */
    string htmlB = webBrowser1.DocumentText; // htmlB only captures the page source

    /* Content changed dynamically by AJAX can be obtained as follows */
    HtmlElement elm = webBrowser1.Document.GetElementById("abc"); // the element whose id is "abc"

    // The null check must come before any use of elm; the original read elm.InnerHtml
    // first and would throw on a page that has no "abc" element.
    if (elm == null)
    {
        return;
    }

    Console.WriteLine("elm.InnerHtml(DocumentCompleted):" + elm.InnerHtml);

    // Attach handlerAbc to listen for the abc element's onpropertychange event.
    elm.AttachEventHandler("onpropertychange", new EventHandler(handlerAbc));
}

思路三:高逼格不解释 http://50.6.6.11/code.html?id=6494,代码如下

using System;
using System.Windows.Forms;

namespace YingFeng.TraderHandle
{
/// <summary>
/// Loads a URL in a private WebBrowser control and raises <see cref="RequestCompleted"/>
/// when the document for the browser's current URL has completed.
/// </summary>
public class AsynWebRequest
{
    /// <summary>The address that <see cref="Navigate"/> loads.</summary>
    public Uri URL { get; set; }

    private WebBrowser browser;

    // True once the document of the most recent Navigate() call has completed.
    // The original indexer tested whether the event had subscribers instead, which says
    // nothing about whether the page has loaded.
    private bool documentLoaded;

    /// <summary>Creates a request for the given address.</summary>
    /// <param name="url">Absolute URL to load.</param>
    public AsynWebRequest(string url)
    {
        this.URL = new Uri(url);
    }

    /// <summary>
    /// Creates the browser control and starts loading <see cref="URL"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><see cref="URL"/> is null.</exception>
    public void Navigate()
    {
        if (this.URL == null)
        {
            throw new InvalidOperationException("URL cannot be empty");
        }

        this.documentLoaded = false;

        // Capture the control in a local so the handler keeps referring to this instance
        // even if Navigate() is called again later.
        WebBrowser current = new WebBrowser();
        current.ScriptErrorsSuppressed = true;
        current.DocumentCompleted += (s, e) =>
        {
            // Ignore completions whose URL is not the browser's current URL.
            if (!e.Url.Equals(current.Url))
            {
                return;
            }

            this.documentLoaded = true;

            // Copy the delegate first: raising an event with no subscribers would throw.
            RequestCompletedEventHandler handler = this.RequestCompleted;
            if (handler != null)
            {
                handler(s, new RequestCompletedEventArgs { Document = current.Document });
            }
        };

        this.browser = current;
        current.Url = this.URL;
    }

    /// <summary>
    /// Gets the inner text of the node with the specified identifier.
    /// </summary>
    /// <param name="identifier">
    /// The element's id (the original note also mentions the name attribute - TODO confirm).
    /// </param>
    /// <returns>The element's InnerText, or null when no such element exists.</returns>
    /// <exception cref="InvalidOperationException">The document has not finished loading.</exception>
    public string this[string identifier]
    {
        get
        {
            if (!this.documentLoaded)
            {
                throw new InvalidOperationException("The document has not been completely loaded");
            }

            HtmlElement element = this.browser.Document.GetElementById(identifier);
            return element == null ? null : element.InnerText;
        }
    }

    /// <summary>Handler signature for <see cref="RequestCompleted"/>.</summary>
    public delegate void RequestCompletedEventHandler(object sender, RequestCompletedEventArgs e);

    /// <summary>Raised when the requested document has completed loading.</summary>
    public event RequestCompletedEventHandler RequestCompleted;

    /// <summary>Event data carrying the loaded document.</summary>
    public class RequestCompletedEventArgs : EventArgs
    {
        /// <summary>The document that finished loading.</summary>
        public HtmlDocument Document { get; set; }
    }
}
}
调用示例:
// Usage example: show the text of element "main-4" twice once the page has loaded -
// first through the document passed with the event, then through the request's indexer.
AsynWebRequest request = new AsynWebRequest("http://stockhtm.finance.qq.com/sstock/ggcx/300101.shtml");
request.RequestCompleted += (src, args) =>
{
    string viaDocument = args.Document.GetElementById("main-4").InnerText;
    MessageBox.Show(viaDocument);

    string viaIndexer = request["main-4"];
    MessageBox.Show(viaIndexer);
};
request.Navigate();

四、上面列举了网络上大概的现有webbrowser应用例子,但是实际中页面复杂程度超过上面方法的处理范围,比如多层次动态ajax加载,需要更积极的思路,总体的思路就是耐心的一层层在恰当时机取得需要的信息,作者很久不用winform开发,但自己突然心血来潮需要采集某宝网站的信息,折腾了数天终于如愿,被ajax动态页面折磨的同学们等我有空再细细道来

未完待续

友荐云推荐