How to get all URLs of the page

How to get all URLs of the page

Topic: How to get all URLs of a web page
namespace PubishApps
{
    class FetchURLsFromSite
    {
    
/************************************************
* Topic : How to fetch all URLs of the site.
* Author : kalit sikka
* For : http://eggheadcafe.com
* **********************************************/
  
        /// <summary>
        /// To fetch all URLs name from the site
        /// </summary>
        /// <param name="webPage"></param>
          public void FetchUrls( string webPage )
          {
           GetAllUrls(GetContent(webPage));
          }

          
         /// <summary>
         /// Get the content of the web page
         /// </summary>
         /// <param name="webPage"></param>
         /// <returns></returns>
          private string GetContent(string webPage)
          {
           HttpWebResponse response = null;//used to get response
           StreamReader respStream = null;//used to read response into string
           try
           {
             //create a request object using the url passed in
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = 10000;

             //go get a response from the page
            response = (HttpWebResponse)request.GetResponse();
             
             //create a streamreader object from the response
            respStream = new StreamReader(response.GetResponseStream());

             //get the contents of the page as a string and return it
            return respStream.ReadToEnd();  
           }
           catch (Exception ex)
           {
            throw ex;
           }
           finally
           {
             //close it down, we're going home!
            response.Close();
            respStream.Close();
           }
          }
          
           /// <summary>
          /// Use regular expression to filter required URLs
          /// </summary>
          /// <param name="content"></param>
        
           private void GetAllUrls(string content)
          {
          
          // Address of local LogFile
          string LocalFile = @"C:\Documents and Settings\kalit.20413\My Documents\LogFile.txt";

          //regular expression
           string pattern = @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?
         .*?)(?:[\s>"
"'])";
          
           //Set up regex object
           Regex RegExpr = new Regex(pattern, RegexOptions.IgnoreCase);

           //get the first match
           Match match = RegExpr.Match(content);

           //loop through matches
           while (match.Success)
           {

             //output the match info
            Console.WriteLine("href match: " + match.Groups[0].Value);
            WriteToLog(LocalFile, "href match: " + match.Groups[0].Value + "\r\n");

             Console.WriteLine("Url match: " + match.Groups[1].Value);
                 
             //get next match
            match = match.NextMatch();
           }
          }

          /// <summary>
          /// Write log at local machine
          /// </summary>
          /// <param name="file"></param>
          /// <param name="message"></param>
          private void WriteToLog(string file, string message)
          {
           using (StreamWriter w = File.AppendText(file))
           {
            w.WriteLine(DateTime.Now.ToString() + ": " + message);
            w.Close();
           }
          }
}

}
By Kalit Sikka   Popularity  (1030 Views)