rss· 投稿· 设为首页· 加入收藏· 繁體版
当前位置: 火魔网 » 程序开发 » VB.NET

HtmlParser.net 解析

HtmlParser.NET 是 Java 开源项目 htmlparser 在 C#、VB.NET 上的移植版本,可以在 http://netomatix.com/products/documentmanagement/htmlparsernet.aspx 上下载。

个别网站(比如百度)存在路径乱码问题:解析 http://www.baidu.com/s?wd=wpe%D6%D5%BC%AB%B0%E6 这样的网页时,会得到错误的页面。解决办法如下:

下载源码包 修改如下

/// <summary>
/// Fetches <paramref name="pageUri"/>, following HTTP redirects, and wraps the
/// outcome in an <see cref="HttpProtocolOutput"/>.
/// </summary>
/// <param name="pageUri">Address of the page to download.</param>
/// <returns>
/// The protocol output, which is also stored in <c>m_ProtocolOutput</c>.
/// On HTTP 200 it carries the response content, headers, cookies and protocol
/// version with <c>STATUS_SUCCESS</c>. On failure it carries <c>null</c> content
/// and a status of <c>STATUS_ROBOTS_DENIED</c>, the HTTP error code, or
/// <c>STATUS_FAILED</c>.
/// </returns>
/// <remarks>
/// No exception escapes this method: every exception raised while fetching is
/// written to <see cref="System.Diagnostics.Trace"/> and converted into a status.
/// </remarks>
public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
{
    this.m_ProtocolOutput = null;
    HttpProtocolStatus obStatus = null;

    try
    {
        int redirects = 0;

        // Each iteration issues one request; only a redirect loops again.
        while (true)
        {
            if (!RobotRulesParser.IsAllowed(pageUri))
            {
                // The robots rules are only enforced when the protocol is
                // configured to honor them.
                if (HttpProtocol.HONOR_ROBOTSTEXT)
                {
                    throw new RobotBlockedException(pageUri);
                }
            }

            System.Net.IPAddress addr = BlockAddr(pageUri);

            HttpResponseMgr response;
            string url = pageUri.ToString();
            try
            {
                // Make a request. The Uri is passed next to its string form.
                // NOTE(review): presumably so HttpResponseMgr can use the
                // original escaped address - confirm against HttpResponseMgr.
                response = new HttpResponseMgr(url, pageUri);
            }
            finally
            {
                // Always release the address, even when the request throws.
                UnblockAddr(addr);
            }

            int code = response.Code;

            if (code == 200)
            {
                // Got a good response: return it.
                obStatus = HttpProtocolStatus.STATUS_SUCCESS;
                m_ProtocolOutput = new HttpProtocolOutput(new HttpProtocolContent(response.Content, response.Headers), obStatus);
                m_ProtocolOutput.Cookies = response.Cookies;
                m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion;
                return m_ProtocolOutput;
            }
            else if (code == 410)
            {
                // Page is gone.
                throw new ResourceGoneException(pageUri, "Http: " + code);
            }
            else if (code >= 300 && code < 400)
            {
                // Handle redirect: resolve Location against the current address.
                if (redirects == MAX_REDIRECTS)
                {
                    throw new System.Web.HttpException("Too many redirects: " + url);
                }

                pageUri = new System.Uri(pageUri, response.GetHeader("Location"));
                redirects++;
                System.Diagnostics.Trace.WriteLine("redirect to " + pageUri);
            }
            else
            {
                // Convert any other status code to an exception.
                throw new HttpError(code);
            }
        }
    }
    catch (RobotBlockedException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
    }
    catch (HttpError ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        obStatus = new HttpProtocolStatus(ex.Code);
        m_ProtocolOutput = new HttpProtocolOutput(null, obStatus);
    }
    catch (System.Exception e)
    {
        System.Diagnostics.Trace.WriteLine(e.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
    }

    return m_ProtocolOutput;
}

把 Http 包中 HttpProtocol 类的该方法替换为上述方法,然后重新生成 dll 文件就可以了。

解决 htmlparser 程序未初始化的问题

' Set the configuration directory first, then build the parser for the page.
Parser.SetConfigLocation(VB6.GetPath & "/config")

Dim uri As New System.Uri(url)
Dim obParser As Parser = New Parser(uri)
obParser.Encoding = "GBK"

把 htmlparser-default.xml 和 htmlparser-site.xml 这两个配置文件放到 bin/debug/config 和 bin/release/config 目录下即可;当然,使用绝对路径也是可以的。

咱初学vb.net 在这个地方没有必要追根究底 有任何错误失误 帮我指正

上传一个html 小例子 解析百度的

Imports Winista.Text.HtmlParser.Parser
Imports Winista.Text.HtmlParser
Imports Microsoft.VisualBasic.Compatibility
Imports Winista.Text.HtmlParser.Util
Imports Winista.Text.HtmlParser.Nodes
Imports Winista.Text.HtmlParser.Filters
Imports Winista.Text.HtmlParser.Tags
Imports Winista.Text.HtmlParser.Http.HttpProtocol
''' <summary>
''' Console sample that downloads a Baidu result page with HtmlParser.NET and
''' prints the link text, link target and text of each matching result table.
''' </summary>
Module parse
    ''' <summary>Entry point: runs the sample against a fixed Baidu search URL.</summary>
    Sub main()
        parseBaidu("http://www.baidu.com/s?wd=%C0%AC%BB%F8%B9%B7%C8%D5%B5%C4%B0%D9%B6%C8&pn=0")
    End Sub

    ''' <summary>
    ''' Parses the page at <paramref name="url"/> and writes the results to the console.
    ''' </summary>
    ''' <param name="url">
    ''' Address of the result page. When it contains "&amp;pn=0", the text of every
    ''' link whose target contains "&amp;oq=" is printed as well.
    ''' </param>
    Sub parseBaidu(ByVal url As String)
        ' The configuration directory is set before the parser is constructed.
        Parser.SetConfigLocation(VB6.GetPath + "/config")

        Dim pageUri As System.Uri = New System.Uri(url)
        Dim obParser As Parser = New Parser(pageUri)
        obParser.Encoding = "GBK"
        Console.WriteLine(pageUri.ToString)

        ' Match <table cellpadding="0" cellspacing="0" class="result" id=...>
        ' elements that do not carry a "mu" attribute.
        Dim t As NodeFilter = New TagNameFilter("table")
        Dim T_att1 As NodeFilter = New HasAttributeFilter("cellpadding", "0")
        Dim T_att2 As NodeFilter = New HasAttributeFilter("cellspacing", "0")
        Dim T_att3 As NodeFilter = New HasAttributeFilter("class", "result")
        Dim T_att4 As NodeFilter = New HasAttributeFilter("id")
        Dim NT_att5 As NodeFilter = New NotFilter(New HasAttributeFilter("mu"))
        Dim textFilter As NodeFilter = New AndFilter(t, New AndFilter(NT_att5, New AndFilter(T_att1, New AndFilter(T_att2, New AndFilter(T_att3, T_att4)))))

        Dim tables As NodeList = obParser.Parse(textFilter)
        Console.WriteLine(tables.Count)

        For i As Integer = 0 To tables.Count - 1
            ' Get the child node: first child of the table's first child.
            ' Tables without that nesting are skipped instead of raising a
            ' NullReferenceException.
            Dim cell As INode = FirstChild(FirstChild(tables.ElementAt(i)))
            If cell Is Nothing Then
                Continue For
            End If

            Dim children As NodeList = cell.Children
            If children Is Nothing OrElse children.Count = 0 Then
                Continue For
            End If

            ' Only entries whose first node is a link are reported.
            Dim l As ATag = TryCast(children.ElementAt(0), ATag)
            If l Is Nothing Then
                Continue For
            End If

            ' Concatenate the text nodes; stop at the first "br" node found
            ' after index 1.
            Dim content As String = ""
            For j As Integer = 0 To children.Count - 1
                Dim c As INode = children.ElementAt(j)
                If TypeOf c Is TextNode Then
                    content &= c.GetText
                ElseIf j > 1 AndAlso String.Equals(c.GetText, "br") Then
                    Exit For
                End If
            Next

            Console.WriteLine(l.LinkText)
            Console.WriteLine(l.Link)
            Console.WriteLine(content)
        Next

        If url.IndexOf("&pn=0", System.StringComparison.Ordinal) <> -1 Then
            ' Rewind the parser so the same page can be scanned a second time.
            obParser.Reset()
            Console.WriteLine("**********************************************")

            Dim linksKeyWord As NodeFilter = New NodeClassFilter(GetType(ATag))
            Dim links As NodeList = obParser.ExtractAllNodesThatMatch(linksKeyWord)

            For li As Integer = 0 To links.Count - 1
                Dim link As ATag = TryCast(links.ElementAt(li), ATag)
                If link Is Nothing Then
                    Continue For
                End If

                ' NOTE(review): links containing "&oq=" are presumably Baidu's
                ' related-search suggestions - confirm against the page markup.
                Dim target As String = link.Link
                If target IsNot Nothing AndAlso target.IndexOf("&oq=", System.StringComparison.Ordinal) <> -1 Then
                    Console.WriteLine(link.LinkText)
                End If
            Next
        End If

    End Sub

    ''' <summary>
    ''' Returns the first child of <paramref name="node"/>, or Nothing when the
    ''' node is Nothing or has no children.
    ''' </summary>
    Private Function FirstChild(ByVal node As INode) As INode
        If node Is Nothing Then
            Return Nothing
        End If

        Dim kids As NodeList = node.Children
        If kids Is Nothing OrElse kids.Count = 0 Then
            Return Nothing
        End If

        Return kids.ElementAt(0)
    End Function
End Module

这个小程序是 原来 用 java写的 移植到 vb.net 上改改语法也能跑 哈

顶一下
(0)
踩一下
(0)