//jleblanc 5.3.06 //PAC: Java crawler //REQUIRES: Queue.java, OccuranceData.java, WordData.java ///////////////////////////////////////////////////////////////// //DESCRIPTION //From console line the program takes in seedfile //It then allows the user to search for words in the crawled files //Provides file occurance information for words //Outputs search trees to: vocabulary.txt, emails.txt, and textfiles.txt /////////////////////////////////////////////////////////////////// //NOTES //All words converted to Lowercase //Periods attached to the end of words are removed // @ and / are considered valid to exsist in words //This site was helpful to me in writing this: www.faqs.org/docs/javap/c12/s4.html import java.io.*; import java.util.*; public class Crawler { //Global Storage Trees TreeMap wordtree = new TreeMap(); TreeMap emailtree= new TreeMap(); TreeMap txttree = new TreeMap(); //Data Structures used to faciliate crawl HashMap hashtable = new HashMap(); Queue fileQ = new Queue(); //Filelist Variables int currentfileref=0; ArrayList filelist= new ArrayList(); //Variable to check if file exsisted or not //Used to ensure that the seedfile exsisted boolean file_real=false; //true if file opened, false if it didn't //Creates and runs the application public static void main(String[] args) { Crawler app = new Crawler(); app.run(); } //Functioning Main progam public void run() { //Variables Local to run() String control= new String("1"); //Used to control user menu String searchword= new String(); //Contains User searchword WordData searchreturn= new WordData("empty", 0); //Gets search result String input_file=new String(); //Used to store User entered seedfile String curfile = new String(); //Holds current file being processed by crawl try { //Create the Console system InputStreamReader reader = new InputStreamReader(System.in); BufferedReader console = new BufferedReader(reader); //Load the Seedfile into System From User System.out.println("Welcome to the crawler\n"); //Ask for seedfile untill it exsists while(! file_real) { System.out.println("Enter SeedFile: "); input_file = console.readLine(); tryopen(input_file); if(file_real==false) System.out.println("The SeedFile you entered couldn't be found"); else System.out.println("\nProcessing Files.....\n\n"); } fileQ.enqueue(input_file); hashtable.put(input_file,input_file); //Crawl while(!fileQ.isEmpty()) { curfile = (String) fileQ.dequeue(); //Get File to Process currentfileref++; //Increment file reference number filelist.add(new String(curfile)); //Add File name to List loadtree(curfile); //Process File } //Let User search for words while(!control.equals("9")) { System.out.println("Enter 1 to search for a word, or 9 to exit"); control = console.readLine(); if(control.equals("1")) { System.out.println(" Enter search word"); System.out.print(" "); searchword = console.readLine(); searchreturn=(WordData) wordtree.get(searchword); if(searchreturn != null) { System.out.println(" "+"found: "+searchreturn.word); searchreturn.Swrite_occurance(filelist); } else { System.out.println(" The word was not found in any files processed"); } } System.out.println(); } System.out.println("The search trees are being printed to file."); System.out.println("Goodbye."); //Write Out the Trees to Text Files write_to_file(wordtree,"data/vocabulary.txt"); write_to_file(emailtree,"data/emails.txt"); write_to_file(txttree,"data/textfiles.txt"); } catch(IOException e) { System.out.println(e); System.exit(1); } }//END run public void write_to_file(TreeMap t, String file) { try { //Print out the contents of the tree //FileWriter w = new FileWriter(file); //NOTE: I use the Buffered reader so I can get newLine() because +"\n" //Was printing a box and not moving to a newline BufferedWriter w = new BufferedWriter(new FileWriter(file)); Iterator it = t.values().iterator(); while (it.hasNext()) { WordData data = (WordData) it.next(); w.write(data.word); //Write word w.newLine(); data.Fwrite_occurance(w, filelist); //Write Occurance Information w.newLine(); w.newLine(); } w.close(); } catch(IOException e) { System.out.println(e); System.exit(1); } }//END write_to_file public void tryopen(String file) { try { BufferedReader r = new BufferedReader(new FileReader(file)); file_real=true; } catch (IOException e) { file_real=false; } }//END tryopen public void loadtree(String file) { try { //Open input_file BufferedReader r = new BufferedReader(new FileReader(file)); //Create a Tokenizer on the file StreamTokenizer st = new StreamTokenizer(r); //Specifies that @ and / can be part of a word so that we can get emails //and subdirectories st.wordChars((int)'@',(int)'@'); st.wordChars((int)'/',(int)'/'); //Load the structures while (st.nextToken() != st.TT_EOF) { file_real=true; if (st.ttype == st.TT_WORD) { //Convert word to all lowercase st.sval=st.sval.toLowerCase(); //Add to wordtree, //NOTE here we get rid or periods attached to words at end of sentences if(st.sval.endsWith(".")==true) addtree(wordtree, st.sval.substring(0,st.sval.length()-1), currentfileref); else addtree(wordtree, st.sval, currentfileref); //If value is a textfile add to txttree and Hashtable + FileQ if not present if(st.sval.endsWith(".txt")==true) { //if textfile isn't in hashtable add it to hashtable and queue if(hashtable.containsValue(st.sval)==false) { hashtable.put(st.sval,st.sval); fileQ.enqueue(st.sval); } addtree(txttree, st.sval, currentfileref); } //If value contains @ and . after 1st character, add to emailtree if(st.sval.indexOf((int) '@')>0 && st.sval.indexOf((int) '.')>0) { addtree(emailtree, st.sval, currentfileref); } } } } //I comment out errors because we will certainly try to open textfiles //which do not exsist + we don't want that to stop the crawl catch (IOException e) { //System.err.println(e); //System.exit(1); } }//END loadtree void addtree(TreeMap t, String input, int fileref) { WordData data = (WordData) t.get(input); if (data == null) { t.put(input, new WordData(input, fileref) ); } else { data.increment(fileref); } }//END loadtree }//End System7