I've been trying to get ideas for a project I'm doing for my course, and I've decided to create a web crawler. While searching for examples I came across the sample program below, which is very similar to what I want to build, so I thought I'd try it out to see what it does and doesn't do, to get an idea of what features to include in my own program. The problem is that, although it comes with brief usage instructions at the top, I have no idea how to run it. I am using Visual Studio 2008: it builds and starts debugging with no errors, but a console window just flashes briefly across the screen and then I am back at the code window.
Thanks in advance.
/*
   MiniCrawler: A skeletal Web crawler.

   Usage: To start crawling, specify a starting URI on the command line.
   For example, to start at McGraw-Hill.com, use this command line:

      MiniCrawler http://McGraw-Hill.com

   (When launching from the Visual Studio debugger, set the argument under
   Project Properties -> Debug -> Command line arguments, or run the built
   .exe from a Command Prompt; otherwise args is empty, the usage message
   is printed, and the console window closes immediately.)
*/
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;

class MiniCrawler
{
    // Find the next absolute http link in an HTML content string.
    //
    // htmlstr:  the HTML text to scan.
    // startloc: on input, the index at which scanning begins; on return,
    //           the index of the link's closing quote (unchanged when no
    //           link is found or the attribute is malformed).
    // Returns the URI between the quotes of the first href="http..."
    // attribute at or after startloc, or null if none is found.
    public static string FindLink(string htmlstr, ref int startloc)
    {
        string uri = null;

        // Case-insensitive search; avoids the original's ToLower() copy of
        // the entire page, and searches the original string directly so the
        // returned indexes are valid without a parallel lowered string.
        int i = htmlstr.IndexOf("href=\"http", startloc,
                                StringComparison.OrdinalIgnoreCase);
        if (i != -1)
        {
            int start = htmlstr.IndexOf('"', i) + 1;
            int end = htmlstr.IndexOf('"', start);

            // Guard against a missing closing quote (malformed HTML).
            // The original passed end == -1 straight into Substring and
            // threw ArgumentOutOfRangeException.
            if (end != -1)
            {
                uri = htmlstr.Substring(start, end - start);
                startloc = end;
            }
        }

        return uri;
    }

    // Entry point: fetches the page at the URI given on the command line,
    // then interactively lets the user follow links found in it (L), keep
    // searching the current page (M), or quit (Q).
    static void Main(string[] args)
    {
        if (args.Length != 1)
        {
            Console.WriteLine("Usage: MiniCrawler <uri>");
            return;
        }

        string uristr = args[0]; // holds the URI to fetch next

        try
        {
            do
            {
                Console.WriteLine("Linking to " + uristr);

                // Create a WebRequest to the specified URI.
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uristr);
                uristr = null; // disallow further use of this URI

                string str;

                // Dispose of the response, stream, and reader even when
                // reading throws. The original only called resp.Close() on
                // the success path and leaked all three on an exception.
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream istrm = resp.GetResponseStream())
                using (StreamReader rdr = new StreamReader(istrm))
                {
                    // Read in the entire page.
                    str = rdr.ReadToEnd();
                }

                int curloc = 0; // current location in the page text
                string link;

                do
                {
                    // Find the next URI to link to.
                    link = FindLink(str, ref curloc);

                    if (link == null)
                    {
                        Console.WriteLine("No link found.");
                        break;
                    }

                    Console.WriteLine("Link found: " + link);
                    Console.Write("Link, More, Quit?");
                    string answer = Console.ReadLine();

                    if (string.Compare(answer, "L", true) == 0)
                    {
                        // Follow this link on the next outer iteration.
                        // (string.Copy in the original was pointless --
                        // strings are immutable; assigning is enough.)
                        uristr = link;
                        break;
                    }
                    else if (string.Compare(answer, "Q", true) == 0)
                    {
                        break;
                    }
                    else if (string.Compare(answer, "M", true) == 0)
                    {
                        Console.WriteLine("Searching for another link.");
                    }
                    // Any other answer falls through and keeps searching.
                } while (link.Length > 0);
            } while (uristr != null);
        }
        catch (WebException exc)
        {
            Console.WriteLine("Network Error: " + exc.Message +
                              "\nStatus code: " + exc.Status);
        }
        catch (ProtocolViolationException exc)
        {
            Console.WriteLine("Protocol Error: " + exc.Message);
        }
        catch (UriFormatException exc)
        {
            Console.WriteLine("URI Format Error: " + exc.Message);
        }
        catch (NotSupportedException exc)
        {
            Console.WriteLine("Unknown Protocol: " + exc.Message);
        }
        catch (IOException exc)
        {
            Console.WriteLine("I/O Error: " + exc.Message);
        }

        Console.WriteLine("Terminating MiniCrawler.");
    }
}