The first class is used to extract links from a website. The extracted links are written to a text file.
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
/**
 * Fetches a web page, echoes its raw HTML to the console, then extracts every
 * hyperlink on the page and writes the absolute link targets, one per line, to
 * {@code abc.txt} in the working directory.
 *
 * <p>Usage: {@code java SimpleWebCrawler [url]}. When no URL is supplied the
 * built-in default page is crawled.
 */
public class SimpleWebCrawler {

    /** Page crawled when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://theworldaccordingtothisgirl.blogspot.com/";

    /** File, relative to the working directory, that receives the extracted links. */
    private static final String OUTPUT_FILE = "abc.txt";

    /** Maximum number of characters of link text shown on the console. */
    private static final int LINK_TEXT_WIDTH = 35;

    /**
     * Program entry point.
     *
     * @param args optionally a single URL to crawl; defaults to {@link #DEFAULT_URL}
     * @throws IOException if the page cannot be fetched by jsoup or the link file
     *                     cannot be written
     * @throws IllegalArgumentException if more than one argument is supplied
     */
    public static void main(String[] args) throws IOException {
        Validate.isTrue(args.length <= 1, "usage: supply at most one url to crawl");
        String url = args.length == 1 ? args[0] : DEFAULT_URL;

        dumpRawHtml(url);
        printSpacer();

        print("Fetching %s...", url);
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("a[href]");
        System.out.println("\n");

        // try-with-resources flushes and closes the file even if a write fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            for (Element link : links) {
                String target = link.attr("abs:href");
                // jsoup yields "" when an href cannot be resolved to an absolute URL;
                // skip those so the link file never contains blank lines.
                if (target.isEmpty()) {
                    continue;
                }
                print(" %s  (%s)", target, trim(link.text(), LINK_TEXT_WIDTH));
                writer.write(target);
                writer.newLine();
            }
        }
    }

    /**
     * Prints the unparsed HTML of {@code url} to standard output. This step is
     * best-effort: a failure is reported on standard error and the crawl goes on.
     */
    private static void dumpRawHtml(String url) {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(url).openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } catch (IOException ex) {
            System.err.println("Could not read raw HTML from " + url + ": " + ex);
        }
    }

    /** Prints the block of blank lines that separates the raw HTML from the link list. */
    private static void printSpacer() {
        for (int i = 0; i < 3; i++) {
            System.out.println("\n");
        }
    }

    /** Formats {@code msg} with {@code args} and prints it followed by a newline. */
    private static void print(String msg, Object... args) {
        System.out.println(String.format(msg, args));
    }

    /**
     * Shortens {@code s} to at most {@code width} characters, replacing the cut-off
     * tail with a single {@code "."}. Strings that already fit are returned unchanged.
     */
    private static String trim(String s, int width) {
        if (s.length() > width) {
            return s.substring(0, width - 1) + ".";
        }
        return s;
    }
}
The second class uses the links gathered above, extracts their contents, and writes them to text files.
import net.htmlparser.jericho.*;
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * Reads a list of URLs (one per line) from a text file, renders each page's HTML
 * to plain text with the Jericho parser, prints the text, saves it to
 * {@code abc<N>.txt} in the working directory, and reports its word count.
 */
public class RenderToText {

    /** Text file holding the URLs to process, one per line. */
    private static final String URL_LIST_FILE =
            "C:\\Users\\user\\fypworkspace\\TextCrawler\\abc.txt";

    /** Upper bound on the number of URLs processed in one run. */
    private static final int MAX_URLS = 10;

    /**
     * Program entry point.
     *
     * @param args optionally a single source (URL or file path); when present it
     *             is rendered in place of every URL read from the list
     * @throws IOException if the URL list file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        try (BufferedReader urlReader = new BufferedReader(new FileReader(URL_LIST_FILE))) {
            // One loop with one counter: each processed URL gets its own output
            // file index, so no output file is overwritten by a later URL.
            int processed = 0;
            while (processed < MAX_URLS) {
                String listedUrl = urlReader.readLine();
                if (listedUrl == null) {
                    // Explicit end-of-file check instead of relying on a
                    // NullPointerException further down.
                    System.out.println("End of URL");
                    break;
                }
                if (listedUrl.trim().isEmpty()) {
                    continue; // ignore blank lines in the list
                }

                String sourceUrl = resolveSource(listedUrl, args);
                try {
                    renderAndSave(sourceUrl, "abc" + processed + ".txt");
                } catch (UnknownServiceException ex) {
                    System.out.println("The following url cannot be processed: " + sourceUrl);
                } catch (IOException ex) {
                    System.out.println("The following url cannot be processed"
                            + " (it may require a login): " + sourceUrl + " - " + ex);
                }
                printSpacer();
                processed++;
            }
        }
    }

    /**
     * Chooses the source to render for one list entry.
     *
     * <p>NOTE(review): as in the original code, a command-line argument overrides
     * every listed URL - confirm this is intended.
     *
     * @return the source as a URL string; a value without a scheme is treated as
     *         a local file path and prefixed with {@code file:}
     */
    private static String resolveSource(String listedUrl, String[] args) {
        String source = listedUrl;
        if (args.length == 0) {
            System.err.println("Using argument of \"" + source + '"');
        } else {
            source = args[0];
        }
        if (source.indexOf(':') == -1) {
            source = "file:" + source;
        }
        return source;
    }

    /**
     * Renders the HTML at {@code sourceUrl} to plain text, prints it, writes it to
     * {@code outputFile} and prints the number of whitespace-separated words.
     *
     * @throws IOException if the source cannot be fetched or the file cannot be written
     */
    private static void renderAndSave(String sourceUrl, String outputFile) throws IOException {
        Source source = new Source(new URL(sourceUrl));
        String renderedText = source.getRenderer().toString();
        System.out.println("\nSimple rendering of the HTML document:\n");
        System.out.println(renderedText);

        // try-with-resources closes the file even if the write fails.
        try (Writer output = new BufferedWriter(new FileWriter(new File(outputFile)))) {
            output.write(renderedText);
        }
        System.out.println("Your file has been written");

        // Count words straight from the rendered text; re-reading the file just
        // written gives the same whitespace-delimited token count.
        int count = new StringTokenizer(renderedText).countTokens();
        System.out.println("File has " + count + " words.");
    }

    /** Prints the block of blank lines that separates the output of consecutive URLs. */
    private static void printSpacer() {
        for (int i = 0; i < 3; i++) {
            System.out.println("\n");
        }
    }
}
The third class does some text processing and displays some data.
import net.htmlparser.jericho.*;
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * Reads a list of URLs (one per line) from a text file, renders each page's HTML
 * to plain text with the Jericho parser, prints the text, saves it to
 * {@code abc<N>.txt} in the working directory, and reports its word count.
 */
public class RenderToText {

    /** Text file holding the URLs to process, one per line. */
    private static final String URL_LIST_FILE =
            "C:\\Users\\user\\fypworkspace\\TextCrawler\\abc.txt";

    /** Upper bound on the number of URLs processed in one run. */
    private static final int MAX_URLS = 10;

    /**
     * Program entry point.
     *
     * @param args optionally a single source (URL or file path); when present it
     *             is rendered in place of every URL read from the list
     * @throws IOException if the URL list file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        try (BufferedReader urlReader = new BufferedReader(new FileReader(URL_LIST_FILE))) {
            // One loop with one counter: each processed URL gets its own output
            // file index, so no output file is overwritten by a later URL.
            int processed = 0;
            while (processed < MAX_URLS) {
                String listedUrl = urlReader.readLine();
                if (listedUrl == null) {
                    // Explicit end-of-file check instead of relying on a
                    // NullPointerException further down.
                    System.out.println("End of URL");
                    break;
                }
                if (listedUrl.trim().isEmpty()) {
                    continue; // ignore blank lines in the list
                }

                String sourceUrl = resolveSource(listedUrl, args);
                try {
                    renderAndSave(sourceUrl, "abc" + processed + ".txt");
                } catch (UnknownServiceException ex) {
                    System.out.println("The following url cannot be processed: " + sourceUrl);
                } catch (IOException ex) {
                    System.out.println("The following url cannot be processed"
                            + " (it may require a login): " + sourceUrl + " - " + ex);
                }
                printSpacer();
                processed++;
            }
        }
    }

    /**
     * Chooses the source to render for one list entry.
     *
     * <p>NOTE(review): as in the original code, a command-line argument overrides
     * every listed URL - confirm this is intended.
     *
     * @return the source as a URL string; a value without a scheme is treated as
     *         a local file path and prefixed with {@code file:}
     */
    private static String resolveSource(String listedUrl, String[] args) {
        String source = listedUrl;
        if (args.length == 0) {
            System.err.println("Using argument of \"" + source + '"');
        } else {
            source = args[0];
        }
        if (source.indexOf(':') == -1) {
            source = "file:" + source;
        }
        return source;
    }

    /**
     * Renders the HTML at {@code sourceUrl} to plain text, prints it, writes it to
     * {@code outputFile} and prints the number of whitespace-separated words.
     *
     * @throws IOException if the source cannot be fetched or the file cannot be written
     */
    private static void renderAndSave(String sourceUrl, String outputFile) throws IOException {
        Source source = new Source(new URL(sourceUrl));
        String renderedText = source.getRenderer().toString();
        System.out.println("\nSimple rendering of the HTML document:\n");
        System.out.println(renderedText);

        // try-with-resources closes the file even if the write fails.
        try (Writer output = new BufferedWriter(new FileWriter(new File(outputFile)))) {
            output.write(renderedText);
        }
        System.out.println("Your file has been written");

        // Count words straight from the rendered text; re-reading the file just
        // written gives the same whitespace-delimited token count.
        int count = new StringTokenizer(renderedText).countTokens();
        System.out.println("File has " + count + " words.");
    }

    /** Prints the block of blank lines that separates the output of consecutive URLs. */
    private static void printSpacer() {
        for (int i = 0; i < 3; i++) {
            System.out.println("\n");
        }
    }
}
I just want to combine these three classes into one GUI with three buttons that call the classes and display their output the same way it appears in the console. I have no experience with GUIs, so thorough advice would be appreciated. Thanks.

New Topic/Question
Reply




MultiQuote








|