Dieses Beispiel zeigt, wie man mit Hilfe von Regular Expressions Hyperlinks aus einem HTML-Dokument parst. Danach wird den Links bis zu einer bestimmten Tiefe gefolgt und der Vorgang wiederholt. Die gefundenen Links (es werden nicht nur Hyperlinks, sondern auch Frames, Iframes und Image Maps unterstützt) werden in einem ListView angezeigt.

 

Der Grund dafür, dass ich diesen Code erstellt habe, war, dass ich eine Google Sitemap für diese Homepage generieren musste.

 

Das vollständige Codesample (Visual Studio 2005 Projekt) könnt ihr hier herunterladen:
ChrischSampleHyperlinkParser.zip

 

Das Beispielprogramm (ausführbare EXE) gibt es hier:

ChrischSampleHyperlinkParserDemo.zip

 

Dieser Artikel wurde auch bei Codeproject veröffentlicht:
http://www.codeproject.com/useritems/HyperlinkParser.asp

 

 

 

 

// Matches a <base ... href="..."> tag; group 2 holds the href value.
protected Regex m_rxBaseHref;

 

// Matches <a ... href="...">text</a>; group 2 holds the URL, group 3 the link text.
protected Regex m_rxHref;

// Matches <frame ... src="...">; group 2 holds the URL.
protected Regex m_rxFrame;

// Matches <iframe ... src="...">; group 2 holds the URL.
protected Regex m_rxIframe;

// Matches <area ... href="...">; group 2 holds the URL.
protected Regex m_rxArea;

 

// URLs that have already been added to the list view (used to avoid duplicates).
protected List<string> m_strListUrlsAdded;

// URLs that have already been requested and parsed (used to avoid re-crawling).
protected List<string> m_strListUrlsFollowed;

 

/// <summary>
/// Initializes the form and builds the regular expressions used to extract
/// links (anchors, frames, iframes, image-map areas) and the document's
/// base href from downloaded HTML.
/// </summary>
public Form1()
{
    InitializeComponent();

    // Singleline lets '.' match newlines so tags and link texts that span
    // several lines are still found; matching is case-insensitive because
    // HTML tag and attribute names are.
    RegexOptions rxOpt = RegexOptions.Singleline |
                         RegexOptions.Compiled |
                         RegexOptions.IgnoreCase;

    // In every pattern group 1 is the opening quote character (" or '),
    // which "\1" requires again as the closing quote, and group 2 is the
    // URL. For anchors, group 3 is the text between <a> and </a>.
    //
    // The "\b" after each tag name is essential: without it "<a" would also
    // match "<area ...>", "<abbr ...>" etc., and the anchor pattern would
    // then run on to the next "</a>", swallowing the real anchor that
    // follows. Likewise "<frame" must not match "<frameset" and "<base"
    // must not match "<basefont".
    m_rxHref = new Regex("<a\\b[^>]*href=(\"|')(.*?)\\1[^>]*>(.*?)</a>", rxOpt);
    m_rxFrame = new Regex("<frame\\b[^>]*src=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxIframe = new Regex("<iframe\\b[^>]*src=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxArea = new Regex("<area\\b[^>]*href=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxBaseHref = new Regex("<base\\b[^>]* href=(\"|')(.*?)\\1[^>]*>", rxOpt);
}

 

/// <summary>
/// Click handler of the "read from URL" button: clears the list view,
/// resets the bookkeeping lists and starts crawling at the URL entered in
/// the text box, up to the depth selected in the numeric up-down control.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnReadFromUrl_Click(object sender, EventArgs e)
{
    Cursor = Cursors.WaitCursor;

    try
    {
        // Clear existing URLs
        lvUrls.Items.Clear();

        m_strListUrlsAdded = new List<string>();
        m_strListUrlsFollowed = new List<string>();

        // The entered URL is both the first page to read and the prefix
        // every collected URL has to start with.
        ReadUrls(tbUrl.Text, tbUrl.Text, ref m_strListUrlsAdded,
            ref m_strListUrlsFollowed, (int)numMaxDepth.Value, 0);
    }
    finally
    {
        // Restore the cursor even if crawling throws; otherwise the form
        // would be stuck showing the wait cursor.
        Cursor = Cursors.Default;
    }
}

 

/// <summary>
/// Downloads the document at <paramref name="strURL"/>, extracts all links
/// (anchors, frames, iframes, image-map areas), adds those that start with
/// <paramref name="strStartBase"/> to the list view and follows each of
/// them recursively.
/// </summary>
/// <param name="strURL">URL of the document to download and parse.</param>
/// <param name="strStartBase">
/// Prefix a resolved URL must start with to be listed and followed.
/// </param>
/// <param name="strUrlsAdded">URLs already shown in the list view.</param>
/// <param name="strUrlsFollowed">URLs already requested.</param>
/// <param name="iMaximumDepth">
/// Depth at which recursion stops. The depth counter is incremented before
/// it is compared, so a value of 1 returns without fetching anything.
/// </param>
/// <param name="iCurrentDepth">Depth of the caller; 0 for the initial call.</param>
/// <remarks>
/// Failures to create the request or to obtain a response are treated as
/// "nothing to read" and end this branch silently.
/// </remarks>
protected void ReadUrls(string strURL, string strStartBase,
                ref List<string> strUrlsAdded,
                ref List<string> strUrlsFollowed,
                int iMaximumDepth,
                int iCurrentDepth)
{
    // Increase the depth. If we reach the maximum depth: return.
    // ">=" instead of "==" so that a maximum depth of zero (or less) also
    // terminates instead of never matching.
    if (++iCurrentDepth >= iMaximumDepth)
    {
        return;
    }

    // Now we create the WebRequest and get the response. If something
    // fails we return.
    HttpWebRequest req = null;

    try
    {
        req = HttpWebRequest.Create(strURL) as HttpWebRequest;
    }
    catch (Exception)
    {
        // Deliberate best effort: an invalid or unsupported URL is skipped.
    }

    if (req == null)
    {
        return;
    }

    req.Method = "GET";

    HttpWebResponse res = null;

    try
    {
        res = req.GetResponse() as HttpWebResponse;
    }
    catch (Exception)
    {
        // Deliberate best effort: unreachable or failing URLs are skipped.
    }

    if (res == null)
    {
        return;
    }

    string strHTML;

    try
    {
        if (res.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        // Read the whole content of the response stream into a string.
        // The using blocks dispose reader and stream even if reading fails.
        using (Stream s = res.GetResponseStream())
        using (StreamReader sr = new StreamReader(s))
        {
            strHTML = sr.ReadToEnd();
        }
    }
    finally
    {
        // Always release the response (and its connection), including on
        // the non-200 early return above.
        res.Close();
    }

    int iPos, iPos2;

    /* After getting a response, the Address property of the web request
       holds the URI of the resource that actually answered (i.e. after any
       redirections).
       For example, you request the URL "http://www.test.abc/test".
       But "test" is just a directory, so the server redirects to the index
       document of this directory. In this case the Address property is
       e.g. "http://www.test.abc/test/index.htm".
    */
    // We need the base to follow relative URLs
    string strBase = req.Address.AbsoluteUri;

    // If the base contains a query string we remove that string
    // because we don't need it.
    iPos = strBase.IndexOf('?');
    if (iPos > -1)
    {
        strBase = strBase.Substring(0, iPos);
    }

    // Assure that the base ends with a slash
    if (strBase[strBase.Length - 1] != '/')
    {
        iPos = strBase.LastIndexOf('/');
        if (iPos < 0)
        {
            return;
        }

        strBase = strBase.Substring(0, iPos + 1);
    }

    iPos = strBase.IndexOf("://", StringComparison.Ordinal);
    if (iPos < 0)
    {
        return;
    }

    // First slash after "scheme://" marks the end of the host part
    iPos = strBase.IndexOf('/', iPos + 3);
    if (iPos < 0)
    {
        return;
    }

    // We need the base host URL for hyperlinks that start with a slash
    string strBaseHostUrl = strBase.Substring(0, iPos + 1);

    // Test if the HTML contains a base href
    Match matchBaseHref = m_rxBaseHref.Match(strHTML);
    if (matchBaseHref.Success)
    {
        string strHtmlBase = matchBaseHref.Groups[2].Value.Trim();
        if (strHtmlBase.StartsWith("/", StringComparison.Ordinal))
        {
            // Host-relative base: strBaseHostUrl already ends with '/'
            strBase = strBaseHostUrl + strHtmlBase.Substring(1);
        }
        else
        {
            strBase = strHtmlBase;
        }
    }

    // This dictionary contains all hyperlinks and their
    // associated "texts" (anything between <a> and </a>)
    Dictionary<string, string> dictHrefs = new Dictionary<string, string>();

    MatchCollection matchesHref = m_rxHref.Matches(strHTML);
    AddHrefMatches(matchesHref, ref dictHrefs);

    MatchCollection matchesFrame = m_rxFrame.Matches(strHTML);
    AddHrefMatches(matchesFrame, ref dictHrefs);

    MatchCollection matchesIframe = m_rxIframe.Matches(strHTML);
    AddHrefMatches(matchesIframe, ref dictHrefs);

    MatchCollection matchesArea = m_rxArea.Matches(strHTML);
    AddHrefMatches(matchesArea, ref dictHrefs);

    // Now we iterate through all hyperlinks we found
    foreach (KeyValuePair<string, string> href in dictHrefs)
    {
        string strUrlNew = href.Key;

        // Skip this link if it starts with ftp://, news://, mailto:,
        // javascript:
        if (IsAbsoluteUrl(strUrlNew) && !IsHttpUrl(strUrlNew))
        {
            continue;
        }

        // If this isn't an absolute URL, resolve it against the host
        // (leading slash) or against the document base.
        if (!IsHttpUrl(strUrlNew))
        {
            if (strUrlNew.StartsWith("/", StringComparison.Ordinal))
            {
                strUrlNew = strBaseHostUrl + strUrlNew.Substring(1);
            }
            else
            {
                strUrlNew = strBase + strUrlNew;
            }
        }

        // Now we remove all parent paths: each "dir/../" is collapsed by
        // cutting back to the slash in front of the preceding segment.
        bool bResolved = true;
        while ((iPos = strUrlNew.IndexOf("../", StringComparison.Ordinal)) > -1)
        {
            // Slash directly in front of "../" ...
            iPos2 = strUrlNew.Substring(0, iPos).LastIndexOf('/');
            if (iPos2 > -1)
            {
                // ... and the slash in front of the segment being removed.
                iPos2 = strUrlNew.Substring(0, iPos2).LastIndexOf('/');
            }

            if (iPos2 < 0)
            {
                // Nothing left to climb out of; without this guard
                // Substring would throw ArgumentOutOfRangeException.
                bResolved = false;
                break;
            }

            strUrlNew = strUrlNew.Substring(0, iPos2) +
                "/" + strUrlNew.Substring(iPos + 3);
        }

        if (!bResolved)
        {
            continue;
        }

        // If the URL doesn't start with our starting base
        // (the address we entered into our textbox) then
        // skip this
        if (!strUrlNew.StartsWith(strStartBase, StringComparison.Ordinal))
        {
            continue;
        }

        // If we haven't added the URL yet to our listview do it now
        if (!strUrlsAdded.Contains(strUrlNew))
        {
            ListViewItem lvi = new ListViewItem(new string[]{
                strUrlNew,
                href.Value
            });
            lvUrls.Items.Add(lvi);

            strUrlsAdded.Add(strUrlNew);
        }

        // Follow this URL if not already done
        if (!strUrlsFollowed.Contains(strUrlNew))
        {
            strUrlsFollowed.Add(strUrlNew);
            ReadUrls(strUrlNew, strStartBase,
                ref strUrlsAdded, ref strUrlsFollowed,
                iMaximumDepth, iCurrentDepth);
        }
    }
}