Articles
FAQs
Login
How to get all URLs of the page
By Kalit Sikka
dtSearch: instantly search terabytes of popular data types; hundreds of reviews, etc.!
How to get all URLs of the page
Topic : How to get all URLs of the page
namespace PubishApps
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    /************************************************
     * Topic  : How to fetch all URLs of the site.
     * Author : kalit sikka
     * For    : http://eggheadcafe.com
     * **********************************************/

    /// <summary>
    /// Downloads a web page, extracts the targets of its <c>href</c> attributes
    /// with a regular expression, prints them to the console and appends the raw
    /// matches to a local log file.
    /// </summary>
    class FetchURLsFromSite
    {
        /// <summary>
        /// Address of the local log file that every href match is appended to.
        /// The directory must already exist; File.AppendText does not create it.
        /// </summary>
        private const string LocalFile =
            @"C:\Documents and Settings\kalit.20413\My Documents\LogFile.txt";

        /// <summary>
        /// Timeout, in milliseconds, applied to the page request.
        /// </summary>
        private const int RequestTimeoutMilliseconds = 10000;

        /// <summary>
        /// Matches <c>href=</c> followed by optional whitespace/quotes and captures
        /// the target in the named group <c>url</c>. Targets that start with
        /// <c>#</c>, <c>mailto</c>, <c>location.</c> or <c>javascript</c>, or that
        /// contain <c>css</c> or <c>this.</c>, are rejected by the negative lookahead.
        /// Built once and reused because the pattern never changes.
        /// </summary>
        private static readonly Regex HrefRegex = new Regex(
            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// To fetch all URLs name from the site: downloads the page and reports
        /// every href found in it.
        /// </summary>
        /// <param name="webPage">Address of the page to download.</param>
        public void FetchUrls(string webPage)
        {
            GetAllUrls(GetContent(webPage));
        }

        /// <summary>
        /// Get the content of the web page.
        /// </summary>
        /// <param name="webPage">Address of the page to download.</param>
        /// <returns>The complete response body as a string.</returns>
        private string GetContent(string webPage)
        {
            // Create a request object using the url passed in.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = RequestTimeoutMilliseconds;

            // The using statements dispose the response and the reader on every
            // path, including when GetResponse or ReadToEnd throws, and they do
            // not touch objects that were never created. Exceptions propagate to
            // the caller unchanged, so the original stack trace is preserved.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader respStream = new StreamReader(response.GetResponseStream()))
            {
                // Get the contents of the page as a string and return it.
                return respStream.ReadToEnd();
            }
        }

        /// <summary>
        /// Use regular expression to filter required URLs. Each match is written
        /// to the console (full match and captured URL) and the full match is
        /// appended to the log file.
        /// </summary>
        /// <param name="content">Page markup to scan for href attributes.</param>
        private void GetAllUrls(string content)
        {
            // Walk the matches one at a time, starting from the first.
            for (Match match = HrefRegex.Match(content); match.Success; match = match.NextMatch())
            {
                // Group 0 is the whole "href=...": attribute name, quotes and target.
                string hrefLine = "href match: " + match.Groups[0].Value;

                Console.WriteLine(hrefLine);

                // The trailing "\r\n" plus WriteLine's own newline leaves a blank
                // line between log entries; kept to preserve the log format.
                WriteToLog(LocalFile, hrefLine + "\r\n");

                // The named group holds only the link target itself.
                Console.WriteLine("Url match: " + match.Groups["url"].Value);
            }
        }

        /// <summary>
        /// Write log at local machine: appends one timestamped line to the file.
        /// </summary>
        /// <param name="file">Path of the log file to append to.</param>
        /// <param name="message">Text written after the timestamp.</param>
        private void WriteToLog(string file, string message)
        {
            // Disposing the writer flushes and closes it; no explicit Close needed.
            using (StreamWriter w = File.AppendText(file))
            {
                w.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }
}
Popularity
(
865 Views
)