Lucene Based Search Engine

To Build a Search File in Lucene using Java


#1 sumanghimire

Lucene Based Search Engine

Posted 25 November 2008 - 07:04 AM

While running this code, I am getting the error below, and I don't know what is causing it.

Exception in thread "main" java.lang.Error: Unresolved compilation problem:
Syntax error on token "else", delete this token

at searchengine.SearchFiles.main(SearchFiles.java:251)


package searchengine;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.ArrayList;

import java.io.File;
import java.util.HashSet;
import java.util.StringTokenizer;


/** Simple command-line based search demo. */

public class SearchFiles {

/** Use the norms from one field for all fields. Norms are read into memory,
* using a byte of memory per document per searched field. This can cause
* search of large collections with a large number of fields to run out of
* memory. If all of the fields contain only a single token, then the norms
* are all identical, then single norm vector may be shared. */
private static class OneNormsReader extends FilterIndexReader {
private String field;

public OneNormsReader(IndexReader in, String field) {
super(in);
this.field = field;
}

public byte[] norms(String field) throws IOException {
return in.norms(this.field);
}
}

private SearchFiles() {}

/** Simple command-line based search demo. */

public static void main(String[] args) throws Exception {
String usage =
"Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
}

String index = "index";
String field = "contents";
String queries = null;
int repeat = 0;
boolean raw = false;
String normsField = null;

for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i+1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i+1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i+1];
i++;
} else if ("-repeat".equals(args[i])) {
repeat = Integer.parseInt(args[i+1]);
i++;
} else if ("-raw".equals(args[i])) {
raw = true;
} else if ("-norms".equals(args[i])) {
normsField = args[i+1];
i++;
}
}

IndexReader reader = IndexReader.open(index);

if (normsField != null)
reader = new OneNormsReader(reader, normsField);

Searcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();

BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new FileReader(queries));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(field, analyzer);


//CISI.REL - storing query number, rel doc ids


String CISI_FILE_PATH = new String("U:\\IR");

final File CISI_QRY_FILE = new File(CISI_FILE_PATH, "CISI.QRY");
final File CISI_REL_FILE = new File(CISI_FILE_PATH, "CISI.REL");

// Control flags of source file
final String DOC_NUM = ".I";
final String DOC_TIT = ".T";
final String DOC_AUT = ".A";
final String DOC_CONT = ".W";
final String DOC_REF = ".B";

String current_doc_flag = "";

try

{
BufferedReader query_input = new BufferedReader(new FileReader(CISI_QRY_FILE));

String query_line = null, concatenated_query = "";
String ref_line = null;

String query_id = null;
HashSet relDocsHashSet = new HashSet(); // Method-HashSet of relevant documents to query_id
ArrayList prlists = new ArrayList(); // add prlists to here. a prlist has precision/recall values for a query
double precision= 0;
double recall = 0;

// Read Query file
while (( query_line = query_input.readLine()) != null){

if(query_line.startsWith(".")){ //-- Come accross a Flag --//

if(current_doc_flag==DOC_CONT){
//Start of Calculating Precision/Recall//
int totalRelDocsNum = relDocsHashSet.size();

switch(totalRelDocsNum){
case 0: // calculate only when it's not empty
break;
default:
Query query = parser.parse(concatenated_query.trim());
//System.out.println("Searching for: " + query.toString(field));

Hits hits = searcher.search(query);
int relDocsNum= 0;

// System.out.println(hits.length() + " total matching documents");

relDocsNum = 0; // reset the number of relevant docs found
PRList prlist = new PRList(); // Instantiate this object per query
for (int i = 0; i <hits.length(); i++) {


Document doc = hits.doc(i);
String path = doc.get("docNo"); // lab4
if (path != null) {

// lab5; if the document is relevant, increment relevant docs num
if(relDocsHashSet.contains(path)){
relDocsNum++;
}
recall = (relDocsNum)/(double)totalRelDocsNum;
precision = (relDocsNum)/((double)i+1.0);

// the below line; print out precision/recall values
//System.out.println((i+1) + ". " + path +", Recall="+recall+ ", Precision="+precision); // lab5
String title = doc.get("contents");
if (title != null) {
System.out.println(" Title: " + doc.get("contents"));
}

prlist.add(recall, precision); // lab6; add recall, precision to PRList
} else {
System.out.println((i+1) + ". " + "No path for this document");
}
}

if (queries != null) // non-interactive
break;
}
prlists.add(prlist); // lab6; add prlists to here. a prlist has precision/recall values for a query
break;
}
concatenated_query = ""; // empty query
}

//+++++++++++++ End of Calculating Precision/Recall +++++++++++++//

if(query_line.startsWith(DOC_NUM)){ // document number

// take query id from the file
query_id = query_line.substring(DOC_NUM.length(),query_line.length()).trim();


// Start of Reading CISI.REL file//



// As a result, relDocsHashSet will store all relevant documents to the current query
BufferedReader rel_input = new BufferedReader(new FileReader(CISI_REL_FILE));
relDocsHashSet.clear(); // clear HashSet of relevant documents to query_id
boolean query_id_found = false; // this flag indicates if relevant documents to query_id has been found
String query_id_in_rel = null;
while (( ref_line = rel_input.readLine()) != null){
StringTokenizer st = new StringTokenizer(ref_line);
query_id_in_rel = st.nextToken().trim(); // take the first field which is query id

if(query_id_in_rel.equals(query_id)){ // when query_id is found
relDocsHashSet.add(st.nextToken().trim()); // add the relevant document id

query_id_found = true; // set the flag; relevant documents to that query has been found
} else if (query_id_found){ // when query id has already been found in CISI.REL file and now the query id in CISI.REL is different one
break; // get out of loop statement
}
}

// the following source code prints out relevant docs to a query
// Extract elements from iterator.
// Note that the elements may not follow the order in which they
// are added to HashSet.
Iterator iter = relDocsHashSet.iterator();// Retrieve an iterator to the hashset:
System.out.print("Relevant Docs of "+query_id + "= ");
while(iter.hasNext())
System.out.print(iter.next() + ",");
System.out.println("");

rel_input.close(); // close file

//+++++++++++++ End of Reading CISI.REL file +++++++++++++//

current_doc_flag = DOC_NUM; // Reset Current document flag

} else if(query_line.startsWith(DOC_TIT)){ // document title
current_doc_flag = DOC_TIT; // Reset Current document flag
} else if(query_line.startsWith(DOC_AUT)){ // document author
current_doc_flag = DOC_AUT; // Reset Current document flag
} else if(query_line.startsWith(DOC_CONT)){ // document contents
current_doc_flag = DOC_CONT; // Reset Current document flag
} else if(query_line.startsWith(DOC_REF)){ // document references
current_doc_flag = DOC_REF; // Reset Current document flag
}

}else if(query_line.equals("")){ //-- Skip spaces


}else if(current_doc_flag==DOC_CONT){ //-- actual Contents
concatenated_query += query_line + " "; // concatenate query lines

}




query_input.close(); // close input file


//-- Lab6; calculate/print out the result --//
// Interpolate the results of each query
ArrayList ilists = new ArrayList(); // InterpolatedPRList

for (int i = 0; i < prlists.size(); i++) {
ilists.add(((PRList)prlists.get(i)).interpolateList());
}

// average across all queries
InterpolatedPRList result = InterpolatedPRList.average(ilists);

// print out result to plot
System.out.println(result);


// Lab6 //


}catch (Exception e)
{
System.err.println("Error: " + e);
}

//----------- End of Lab5
reader.close();

}
}


Replies To: Lucene Based Search Engine

#2 cfoley

Re: Lucene Based Search Engine

Posted 25 November 2008 - 08:59 AM

Errors like that are often caused by mismatched curly brackets, and they often occur toward the end of the file when the compiler thinks the class should be finished but there's still code there. It's hard to inspect your code because there's no indentation. Putting it in [code ][/code ] tags preserves the correct indentation.

I haven't had a thorough look for that reason, but I did notice this near the top of your class, and it looks like an extra closing brace to me:

public byte[] norms(String field) throws IOException {
return in.norms(this.field);
}
}


Have a look through your code, checking that the curly brackets all line up (indentation really helps here). If you're still having problems, come back with code tags. Hope that helps!
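
To show what I mean, here's a stripped-down example (mine, not from your code) that you can play with. As written it compiles and runs; the comments show how a single stray closing brace turns into exactly the message you're seeing:

    public class BraceDemo {
        public static void main(String[] args) {
            String line = ".T";
            if (line.startsWith(".I")) {
                System.out.println("document number");
            } else if (line.startsWith(".T")) { // legal: the else follows the if's closing brace
                System.out.println("document title");
            }
            // } else { ... }
            // ^ an extra "}" before the "else", like the line above, would close main()
            //   early, leave the "else" with no "if" to attach to, and make Eclipse
            //   report: Syntax error on token "else", delete this token
        }
    }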


#3 sumanghimire

Re: Lucene Based Search Engine

Posted 25 November 2008 - 09:03 AM

Yes, I fixed that problem. But now I am not getting the values I want in the recall and precision table; I am just getting zeros.
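
For anyone reading, the loop in the code below is supposed to compute precision and recall at each rank like this. Here is a self-contained sketch of that calculation with made-up doc ids and relevance judgements (not the real CISI data):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class PrecisionRecallDemo {
        public static void main(String[] args) {
            // Made-up ranked hit list and relevance set, for illustration only
            List<String> rankedHits = Arrays.asList("12", "7", "33", "5", "41");
            Set<String> relevant = new HashSet<String>(Arrays.asList("7", "5", "99"));

            int relFound = 0; // plays the role of relDocsNum in the code below
            for (int i = 0; i < rankedHits.size(); i++) {
                if (relevant.contains(rankedHits.get(i))) {
                    relFound++; // this hit is one of the judged-relevant documents
                }
                // recall at rank k    = relevant found so far / total relevant for the query
                // precision at rank k = relevant found so far / documents examined so far
                double recall = relFound / (double) relevant.size();
                double precision = relFound / (double) (i + 1);
                System.out.println((i + 1) + ". recall=" + recall + ", precision=" + precision);
            }
        }
    }

Note that if relevant.contains(...) is never true (in the real code: if the docNo values stored in the index never match the ids read from CISI.REL), relFound stays at 0 and every printed value is 0.0, which is one way to end up with nothing but zeros.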

package searchengine;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *	 http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.ArrayList;

import java.io.File;
import java.util.HashSet;
import java.util.StringTokenizer;

/** Simple command-line based search demo. */
public class SearchFiles {
	
	/** Use the norms from one field for all fields.  Norms are read into memory,
	 * using a byte of memory per document per searched field.  This can cause
	 * search of large collections with a large number of fields to run out of
	 * memory.  If all of the fields contain only a single token, then the norms
	 * are all identical, then single norm vector may be shared. */
	private static class OneNormsReader extends FilterIndexReader {
		private String field;
		
		public OneNormsReader(IndexReader in, String field) {
			super(in);
			this.field = field;
		}
		
		public byte[] norms(String field) throws IOException {
			return in.norms(this.field);
		}
	}
	
	private SearchFiles() {}
	
	/** Simple command-line based search demo. */
	public static void main(String[] args) throws Exception {
		String usage =
			"Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
		if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
			System.out.println(usage);
			System.exit(0);
		}
		
		String index = "index";
		String field = "contents";
		String queries = null;
		int repeat = 0;
		boolean raw = false;
		String normsField = null;
		
		for (int i = 0; i < args.length; i++) {
			if ("-index".equals(args[i])) {
				index = args[i+1];
				i++;
			} else if ("-field".equals(args[i])) {
				field = args[i+1];
				i++;
			} else if ("-queries".equals(args[i])) {
				queries = args[i+1];
				i++;
			} else if ("-repeat".equals(args[i])) {
				repeat = Integer.parseInt(args[i+1]);
				i++;
			} else if ("-raw".equals(args[i])) {
				raw = true;
			} else if ("-norms".equals(args[i])) {
				normsField = args[i+1];
				i++;
			}
		}
		
		IndexReader reader = IndexReader.open(index);
		
		if (normsField != null)
			reader = new OneNormsReader(reader, normsField);
		
		Searcher searcher = new IndexSearcher(reader);
		Analyzer analyzer = new StandardAnalyzer();
		
		BufferedReader in = null;
		if (queries != null) {
			in = new BufferedReader(new FileReader(queries));
		} else {
			in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
		}
		QueryParser parser = new QueryParser(field, analyzer);
		
		//----------- Start of Lab5 (CISI.REL - store query number, relevant document ids
		String CISI_FILE_PATH = new String("U:\\IR");
		
		final File CISI_QRY_FILE = new File(CISI_FILE_PATH, "CISI.QRY");
		final File CISI_REL_FILE = new File(CISI_FILE_PATH, "CISI.REL");
		
		// control flags of source file
		final String DOC_NUM = ".I";
		final String DOC_TIT = ".T";
		final String DOC_AUT = ".A";
		final String DOC_CONT = ".W";
		final String DOC_REF = ".B";
		
		String current_doc_flag = "";
		
		try
		{
			BufferedReader query_input =  new BufferedReader(new FileReader(CISI_QRY_FILE));
			
			String query_line = null, concatenated_query = "";
			String ref_line = null;
			
			String query_id = null;
			HashSet relDocsHashSet = new HashSet(); // HashSet of relevant documents to query_id
			ArrayList prlists = new ArrayList(); // lab6; add prlists to here. a prlist has precision/recall values for a query
			//double relDocsNum = 0;
			double precision = 0, recall = 0;
			
			// Read Query file
			while (( query_line = query_input.readLine()) != null){
				
				if(query_line.startsWith(".")){ //--	Come accross a Flag		--//

					if(current_doc_flag==DOC_CONT){//+++++++++++++ Start of Calculating Precision/Recall +++++++++++++//
						int totalRelDocsNum = relDocsHashSet.size();

						switch(totalRelDocsNum){
						case 0: // calculate only when it's not empty
							break;
						default:
							Query query = parser.parse(concatenated_query.trim());
							//System.out.println("Searching for: " + query.toString(field));
						
							Hits hits = searcher.search(query);
							int relDocsNum=0;
							//System.out.println(hits.length() + " total matching documents");
							
							 relDocsNum = 0; // reset the number of relevant docs found
							//final int HITS_PER_PAGE = 100000;
							PRList prlist = new PRList(); // lab6; instantiate this object per query
							for (int i = 0; i<hits.length(); i++){
								
									
									Document doc = hits.doc(i);
									String path = doc.get("docNo"); // lab4
									if (path != null) {
										
										// lab5; if the document is relevant, increment relevant docs num
										if(relDocsHashSet.contains(path)){
											relDocsNum++;
										}
										recall = (relDocsNum)/(double)totalRelDocsNum;
										precision = (relDocsNum)/((double) i+1.0);
										
										// the below line; print out precision/recall values
										//System.out.println((i+1) + ". " + path +", Recall="+recall+ ", Precision="+precision); // lab5
										String title = doc.get("contents");
										if (title != null) {
											System.out.println("   Title: " + doc.get("contents"));
										}
										
										prlist.add(recall, precision); // lab6; add recall, precision to PRList
									} else {
										System.out.println((i+1) + ". " + "No path for this document");
									
								}
								
								if (queries != null)					  // non-interactive
									break;
							}
							prlists.add(prlist); // lab6; add prlists to here. a prlist has precision/recall values for a query
							break;
						}
						concatenated_query = ""; // empty query
					}//+++++++++++++ End of Calculating Precision/Recall +++++++++++++//

					if(query_line.startsWith(DOC_NUM)){ // document number
						
						// take query id from the file
						query_id = query_line.substring(DOC_NUM.length(),query_line.length()).trim();
						
						//+++++++++++++ Start of Reading CISI.REL file +++++++++++++//
						// As a result, relDocsHashSet will store all relevant documents to the current query
						BufferedReader rel_input =  new BufferedReader(new FileReader(CISI_REL_FILE));
						relDocsHashSet.clear(); // clear HashSet of relevant documents to query_id
						boolean query_id_found = false; // this flag indicates if relevant documents to query_id has been found
						String query_id_in_rel = null;
						while (( ref_line = rel_input.readLine()) != null){
							StringTokenizer st = new StringTokenizer(ref_line);
							query_id_in_rel = st.nextToken().trim(); // take the first field which is query id
							
							if(query_id_in_rel.equals(query_id)){ // when query_id is found
								relDocsHashSet.add(st.nextToken().trim()); // add the relevant document id
								
								query_id_found = true; // set the flag; relevant documents to that query has been found
							} else if (query_id_found){ // when query id has already been found in CISI.REL file and now the query id in CISI.REL is different one
								break; // get out of loop statement
							}
						}

						//++ the following source code prints out relevant docs to a query
						// Extract elements from iterator.
						// Note that the elements may not follow the order in which they
						// are added to HashSet.
						Iterator iter = relDocsHashSet.iterator();// Retrieve an iterator to the hashset:
						System.out.print("Relevant Docs of "+query_id + "= ");
						while(iter.hasNext())
							System.out.print(iter.next() + ",");
						System.out.println("");

						rel_input.close(); // close file
						//+++++++++++++ End of Reading CISI.REL file +++++++++++++//
						
						current_doc_flag = DOC_NUM; // Reset Current document flag
					} else if(query_line.startsWith(DOC_TIT)){  // document title				  
						current_doc_flag = DOC_TIT; // Reset Current document flag
					} else if(query_line.startsWith(DOC_AUT)){ // document author
						current_doc_flag = DOC_AUT; // Reset Current document flag
					} else if(query_line.startsWith(DOC_CONT)){ // document contents					
						current_doc_flag = DOC_CONT; // Reset Current document flag
					} else if(query_line.startsWith(DOC_REF)){ // document references
						current_doc_flag = DOC_REF; // Reset Current document flag
					}
				}else if(query_line.equals("")){ //--	 Skip spaces		--//

				}else if(current_doc_flag==DOC_CONT){ //--		 actual Contents		--//
					concatenated_query += query_line + " "; // concatenate query lines
				}
			}
			
			query_input.close(); // close input file
			
			
			//--		Lab6; calculate/print out the result		--//
			// interpolate the results of each query
			ArrayList ilists = new ArrayList(); // InterpolatedPRList

			for (int i = 0; i < prlists.size(); i++) {
				ilists.add(((PRList)prlists.get(i)).interpolateList());
			}

			// average across all queries
			InterpolatedPRList result = InterpolatedPRList.average(ilists);

			// print out result to plot
			System.out.println(result);
			//--		Lab6		--//			
			
			
		}catch (Exception e)
		{
			System.err.println("Error: " + e);
		}

		//----------- End of Lab5
		reader.close();
	}
}






#4 pbl

Re: Lucene Based Search Engine

Posted 25 November 2008 - 08:42 PM

You have an extra } in front of an else statement:

} else if(query_line.startsWith(DOC_TIT)){ // document title 
current_doc_flag = DOC_TIT; // Reset Current document flag
} else if(query_line.startsWith(DOC_AUT)){ // document author
current_doc_flag = DOC_AUT; // Reset Current document flag
} else if(query_line.startsWith(DOC_CONT)){ // document contents 
current_doc_flag = DOC_CONT; // Reset Current document flag
} else if(query_line.startsWith(DOC_REF)){ // document references
current_doc_flag = DOC_REF; // Reset Current document flag
}		   // <-------- with this one

}else if(query_line.equals("")){ //-- Skip spaces	<----------------- here
 

}else if(current_doc_flag==DOC_CONT){ //-- actual Contents 
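
Once you delete that brace, the chain lines up the way you meant it. Here is a condensed, runnable sketch of the intended structure (I replaced the file input with an in-memory string just for illustration):

    import java.io.BufferedReader;
    import java.io.StringReader;

    public class FlagChainDemo {
        public static void main(String[] args) throws Exception {
            // In-memory stand-in for CISI.QRY, just for illustration
            BufferedReader in = new BufferedReader(new StringReader(
                    ".I 1\n.W\nfirst query line\nsecond query line\n"));
            String flag = "", query = "", line;
            while ((line = in.readLine()) != null) {
                if (line.startsWith(".")) {            // a control-flag line
                    if (line.startsWith(".I"))      flag = ".I";
                    else if (line.startsWith(".W")) flag = ".W";
                } else if (line.equals("")) {          // skip blank lines
                } else if (flag.equals(".W")) {        // content lines follow .W
                    query += line + " ";
                }
            } // every brace above pairs with exactly one opener
            System.out.println("concatenated query: " + query.trim());
        }
    }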


