I've been trying to get some ideas for a project I am doing for my course, and have decided to create a web crawler. Whilst searching for examples I came across this sample program, which is very similar to what I want to do, so I thought I'd try it out to see what it does and does not do, and get an idea of which features to have in my own program. The problem is that, although it comes with brief instructions at the top of the program, I have no idea how to use it. I am using Visual Studio 2008. When I start debugging, it seems to build and run fine with no errors; a console window flashes briefly on the screen, and then I am back looking at the code window.
Thanks in advance.
/* MiniCrawler: A skeletal Web crawler.
Usage:
To start crawling, specify a starting
URI on the command line. For example,
to start at McGraw-Hill.com, use this
command line:
MiniCrawler http://McGraw-Hill.com
*/
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System;
using System.Net;
using System.IO;
/// <summary>
/// A skeletal, interactive Web crawler. Starting from the URI given on the
/// command line, it downloads the page, shows each absolute link it finds,
/// and lets the user follow the link (L), look for another one (M), or quit (Q).
/// </summary>
class MiniCrawler
{
    // Marks the start of an absolute http/https link inside an href attribute.
    private const string LinkMarker = "href=\"http";

    /// <summary>
    /// Finds the next double-quoted href value beginning with "http" in a content string.
    /// </summary>
    /// <param name="htmlstr">The page content to search.</param>
    /// <param name="startloc">
    /// Index at which to start searching. When a link is found it is advanced to
    /// the link's closing quote so the next call continues after this link.
    /// </param>
    /// <returns>
    /// The link text, or null when no further complete link exists
    /// (including when the closing quote of the attribute is missing).
    /// </returns>
    private static string FindLink(string htmlstr,
                                   ref int startloc)
    {
        if (string.IsNullOrEmpty(htmlstr) || startloc < 0 || startloc >= htmlstr.Length)
        {
            return null;
        }

        // Case-insensitive ordinal search directly on the original text: avoids
        // allocating a lower-cased copy of the page and culture-dependent casing.
        int marker = htmlstr.IndexOf(LinkMarker, startloc, StringComparison.OrdinalIgnoreCase);
        if (marker == -1)
        {
            return null;
        }

        // The first quote at or after the marker is the one that opens the attribute value.
        int start = htmlstr.IndexOf('"', marker) + 1;
        int end = htmlstr.IndexOf('"', start);
        if (end == -1)
        {
            // Unterminated attribute (truncated or malformed page): report
            // "no link" rather than letting Substring throw.
            return null;
        }

        startloc = end;
        return htmlstr.Substring(start, end - start);
    }

    /// <summary>
    /// Downloads the resource at <paramref name="uri"/> and returns its content as text.
    /// The response, its stream and the reader are released even if reading fails.
    /// </summary>
    private static string ReadPage(string uri)
    {
        // Use the base WebRequest/WebResponse types: casting to the Http* types
        // would throw InvalidCastException for non-HTTP schemes.
        WebRequest req = WebRequest.Create(uri);
        using (WebResponse resp = req.GetResponse())
        using (Stream istrm = resp.GetResponseStream())
        using (StreamReader rdr = new StreamReader(istrm))
        {
            return rdr.ReadToEnd();
        }
    }

    /// <summary>
    /// Walks the links in <paramref name="page"/>, prompting the user for each one.
    /// </summary>
    /// <returns>
    /// The link the user chose to follow, or null when the user quits,
    /// input ends, or the page has no more links.
    /// </returns>
    private static string ChooseNextLink(string page)
    {
        int curloc = 0; // holds current location in the page

        while (true)
        {
            string link = FindLink(page, ref curloc);
            if (link == null)
            {
                Console.WriteLine("No link found.");
                return null;
            }

            Console.WriteLine("Link found: " + link);
            Console.Write("Link, More, Quit?");
            string answer = Console.ReadLine();

            // ReadLine returns null at end of input; treat that as Quit so a
            // closed or redirected stdin cannot drive the loop through every link.
            if (answer == null || string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase))
            {
                return link;
            }

            if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase))
            {
                Console.WriteLine("Searching for another link.");
            }
            // Any other answer silently moves on to the next link.
        }
    }

    /// <summary>
    /// Program entry point. Expects exactly one argument: the URI to start crawling from.
    /// </summary>
    static void Main(string[] args)
    {
        if (args.Length != 1)
        {
            Console.WriteLine("Usage: MiniCrawler <uri>");
            return;
        }

        string uristr = args[0]; // holds current URI; null means stop crawling

        try
        {
            while (uristr != null)
            {
                Console.WriteLine("Linking to " + uristr);
                string page = ReadPage(uristr);
                uristr = ChooseNextLink(page);
            }
        }
        catch (WebException exc)
        {
            Console.WriteLine("Network Error: " + exc.Message +
                              "\nStatus code: " + exc.Status);
        }
        catch (ProtocolViolationException exc)
        {
            Console.WriteLine("Protocol Error: " + exc.Message);
        }
        catch (UriFormatException exc)
        {
            Console.WriteLine("URI Format Error: " + exc.Message);
        }
        catch (NotSupportedException exc)
        {
            Console.WriteLine("Unknown Protocol: " + exc.Message);
        }
        catch (IOException exc)
        {
            Console.WriteLine("I/O Error: " + exc.Message);
        }

        Console.WriteLine("Terminating MiniCrawler.");
    }
}

New Topic/Question
This topic is locked




MultiQuote


|