htmlparser 是 Java 开源项目 HTMLParser 在 C#、VB.NET 上的移植版本，可以在
http://netomatix.com/products/documentmanagement/htmlparsernet.aspx 下载。
个别网址（例如百度的搜索路径）会遇到路径乱码问题：解析 http://www.baidu.com/s?wd=wpe%D6%D5%BC%AB%B0%E6 这样的网页时，得到的网页内容不对。解决方法如下：
下载源码包，把相应的方法修改为如下内容：
/// <summary>
/// Fetches <paramref name="pageUri"/> over HTTP, following redirects, and wraps the
/// outcome in an <c>HttpProtocolOutput</c>. The result is also stored in
/// <c>m_ProtocolOutput</c>.
/// </summary>
/// <param name="pageUri">Address of the page to fetch.</param>
/// <returns>
/// On HTTP 200: an output carrying the response content, headers, cookies and protocol
/// version with <c>STATUS_SUCCESS</c>. Otherwise an output with null content whose
/// status is <c>STATUS_ROBOTS_DENIED</c>, the status built from the <c>HttpError</c>
/// code, or <c>STATUS_FAILED</c> for any other exception.
/// </returns>
/// <remarks>
/// Failures are reported through the returned status; every exception raised inside
/// the request loop is traced and converted by the catch blocks at the end.
/// </remarks>
public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
{
    this.m_ProtocolOutput = null;
    HttpProtocolStatus obStatus = null;

    try
    {
        int redirects = 0;

        // Each iteration issues one request; the loop only repeats on a 3xx response.
        while (true)
        {
            // Robots rules are enforced only when HONOR_ROBOTSTEXT is switched on.
            if (!RobotRulesParser.IsAllowed(pageUri))
            {
                if (HttpProtocol.HONOR_ROBOTSTEXT)
                {
                    throw new RobotBlockedException(pageUri);
                }
            }

            // NOTE(review): BlockAddr/UnblockAddr presumably serialize access per host;
            // their definitions are not visible here - confirm.
            System.Net.IPAddress addr = BlockAddr(pageUri);
            HttpResponseMgr response;

            // The request is built from the string form of the current URI. Per the
            // accompanying article this is the change that fixes the garbled-path
            // problem, so it is kept exactly as written.
            string url = pageUri.ToString();
            try
            {
                response = new HttpResponseMgr(url, pageUri); // make a request
            }
            finally
            {
                // Always release the address, even when the request throws.
                UnblockAddr(addr);
            }

            int code = response.Code;
            if (code == 200)
            {
                // got a good response
                obStatus = HttpProtocolStatus.STATUS_SUCCESS;
                m_ProtocolOutput = new HttpProtocolOutput(new HttpProtocolContent(response.Content, response.Headers), obStatus); // return it
                m_ProtocolOutput.Cookies = response.Cookies;
                m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion;
                return m_ProtocolOutput;
            }
            else if (code == 410)
            {
                // page is gone
                throw new ResourceGoneException(pageUri, "Http: " + code);
            }
            else if (code >= 300 && code < 400)
            {
                // handle redirect
                if (redirects == MAX_REDIRECTS)
                {
                    throw new System.Web.HttpException("Too many redirects: " + url);
                }

                // Resolve the Location header against the current URI so that relative
                // redirect targets work, then loop to request the new address.
                pageUri = new System.Uri(pageUri, response.GetHeader("Location"));
                redirects++;
                System.Diagnostics.Trace.WriteLine("redirect to " + pageUri);
            }
            else
            {
                // convert to exception
                throw new HttpError(code);
            }
        }
    }
    catch (RobotBlockedException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
    }
    catch (HttpError ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        obStatus = new HttpProtocolStatus(ex.Code);
        m_ProtocolOutput = new HttpProtocolOutput(null, obStatus);
    }
    catch (System.Exception e)
    {
        // Everything else (including the "too many redirects" and "resource gone"
        // cases, unless those types derive from one handled above) is reported as a
        // generic failure.
        System.Diagnostics.Trace.WriteLine(e.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
    }

    return m_ProtocolOutput;
}
把 Http 包中 HttpProtocol 里的这个方法替换为上述方法，然后重新生成 dll 文件就可以了。
解决 htmlparser 程序未初始化的问题
' Initialisation sequence for the parser. Per the surrounding notes, this sequence
' is the fix for the "program not initialised" problem.
Dim obParser As Parser
' Point the parser at the "config" directory before constructing it.
' NOTE(review): VB6.GetPath presumably returns the application's directory - confirm.
Parser.SetConfigLocation(VB6.GetPath + "/config")
Dim uri As System.Uri = New System.Uri(url)
obParser = New Parser(uri)
' Set the parser's encoding to GBK.
' NOTE(review): presumably because the target pages are GBK-encoded - confirm.
obParser.Encoding = "GBK"
把 htmlparser-default.xml
和 htmlparser-site.xml 这两个配置文件放到 bin/debug/config、bin/release/config 目录下即可；当然，使用绝对路径也可以。
我刚开始学 VB.NET，在这个地方就不追根究底了；如有任何错误或疏漏，请帮我指正。
上传一个html 小例子 解析百度的
Imports Winista.Text.HtmlParser.Parser
Imports Winista.Text.HtmlParser
Imports Microsoft.VisualBasic.Compatibility
Imports Winista.Text.HtmlParser.Util
Imports Winista.Text.HtmlParser.Nodes
Imports Winista.Text.HtmlParser.Filters
Imports Winista.Text.HtmlParser.Tags
Imports Winista.Text.HtmlParser.Http.HttpProtocol
''' <summary>
''' Console sample: downloads a Baidu search-result page with the Winista HtmlParser
''' and prints, for every matching result table, the link text, the link target and
''' the text that follows the link.
''' </summary>
Module parse

    ''' <summary>Entry point; parses the first result page of a fixed sample query.</summary>
    Sub main()
        parseBaidu("http://www.baidu.com/s?wd=%C0%AC%BB%F8%B9%B7%C8%D5%B5%C4%B0%D9%B6%C8&pn=0")
    End Sub

    ''' <summary>
    ''' Parses the page at <paramref name="url"/> and writes the extracted results to
    ''' the console. When the URL contains "&amp;pn=0" the page is parsed a second time
    ''' and the text of every link whose target contains "&amp;oq=" is printed as well.
    ''' </summary>
    ''' <param name="url">Absolute URL of the search-result page.</param>
    Sub parseBaidu(ByVal url As String)
        ' The configuration directory is set before the parser is constructed.
        Parser.SetConfigLocation(VB6.GetPath + "/config")

        Dim pageUri As System.Uri = New System.Uri(url)
        Dim obParser As Parser = New Parser(pageUri)
        obParser.Encoding = "GBK"
        Console.WriteLine(pageUri.ToString)

        Dim results As NodeList = obParser.Parse(BuildResultTableFilter())
        Console.WriteLine(results.Count)
        PrintResults(results)

        If url.IndexOf("&pn=0") <> -1 Then
            PrintKeywordLinks(obParser)
        End If
    End Sub

    ''' <summary>
    ''' Builds the filter selecting result tables: a "table" tag with cellpadding="0",
    ''' cellspacing="0", class="result", an "id" attribute and no "mu" attribute.
    ''' </summary>
    Private Function BuildResultTableFilter() As NodeFilter
        Dim isTable As NodeFilter = New TagNameFilter("table")
        Dim hasPadding As NodeFilter = New HasAttributeFilter("cellpadding", "0")
        Dim hasSpacing As NodeFilter = New HasAttributeFilter("cellspacing", "0")
        Dim isResult As NodeFilter = New HasAttributeFilter("class", "result")
        Dim hasId As NodeFilter = New HasAttributeFilter("id")
        Dim lacksMu As NodeFilter = New NotFilter(New HasAttributeFilter("mu"))

        ' Nesting order is kept as in the original sample so the filters are
        ' evaluated in the same sequence.
        Return New AndFilter(isTable, New AndFilter(lacksMu, New AndFilter(hasPadding, New AndFilter(hasSpacing, New AndFilter(isResult, hasId)))))
    End Function

    ''' <summary>Returns the first child of <paramref name="node"/>, or Nothing if there is none.</summary>
    Private Function FirstChildOf(ByVal node As INode) As INode
        If node Is Nothing Then
            Return Nothing
        End If

        Dim kids As NodeList = node.Children()
        If kids Is Nothing OrElse kids.Count = 0 Then
            Return Nothing
        End If

        Return kids.ElementAt(0)
    End Function

    ''' <summary>
    ''' Prints link text, link target and trailing text for every result table whose
    ''' first grandchild starts with an anchor tag. Tables with any other shape are skipped.
    ''' </summary>
    Private Sub PrintResults(ByVal results As NodeList)
        For i As Integer = 0 To results.Count - 1
            ' Get the child nodes: first child of the table's first child.
            Dim cell As INode = FirstChildOf(FirstChildOf(results.ElementAt(i)))
            If cell Is Nothing Then
                Continue For
            End If

            Dim children As NodeList = cell.Children()
            If children Is Nothing OrElse children.Count = 0 Then
                Continue For
            End If

            ' Only entries that begin with a link are treated as search results.
            Dim anchor As ATag = TryCast(children.ElementAt(0), ATag)
            If anchor Is Nothing Then
                Continue For
            End If

            ' Collect the text nodes up to the first "br" found after index 1.
            Dim content As String = ""
            For j As Integer = 0 To children.Count - 1
                Dim c As INode = children.ElementAt(j)
                If TypeOf c Is TextNode Then
                    content &= c.GetText()
                ElseIf j > 1 AndAlso "br".Equals(c.GetText()) Then
                    Exit For
                End If
            Next

            Console.WriteLine(anchor.LinkText)
            Console.WriteLine(anchor.Link)
            Console.WriteLine(content)
        Next
    End Sub

    ''' <summary>
    ''' Resets the parser, extracts every anchor tag on the page and prints the link
    ''' text of those whose target contains "&amp;oq=".
    ''' </summary>
    ''' <remarks>NOTE(review): these presumably are the related-search suggestions - confirm.</remarks>
    Private Sub PrintKeywordLinks(ByVal obParser As Parser)
        obParser.Reset()
        Console.WriteLine("**********************************************")

        Dim anchorFilter As NodeFilter = New NodeClassFilter(GetType(ATag))
        Dim links As NodeList = obParser.ExtractAllNodesThatMatch(anchorFilter)

        For li As Integer = 0 To links.Count - 1
            Dim link As ATag = TryCast(links.ElementAt(li), ATag)
            If link Is Nothing Then
                Continue For
            End If

            Dim target As String = link.Link
            If target IsNot Nothing AndAlso target.IndexOf("&oq=") <> -1 Then
                Console.WriteLine(link.LinkText)
            End If
        Next
    End Sub

End Module
这个小程序原来是用 Java 写的，移植到 VB.NET 上，改改语法也能跑，哈。