//jleblanc 5.2.06 (Updated) //PAC //Java crawler (currently class System6) //REQUIRES: Queue.java class authored by Michael Lewis, OccuranceData, WordData2 //From console line takes in seedfile //Allows the user to search for words in the crawled files //All words converted to Lowercase //Provides file occurance information //Outputs: vocabulary.txt, emails.txt, and textfiles.txt //This site was helpful to me in writing this: www.faqs.org/docs/javap/c12/s4.html import java.io.*; import java.util.*; public class System7 { //Global Storage Variables TreeMap wordtree = new TreeMap(); TreeMap emailtree= new TreeMap(); TreeMap txttree = new TreeMap(); HashMap hashtable = new HashMap(); Queue fileQ = new Queue(); //Creates and runs the application public static void main(String[] args) { System7 app = new System7(); app.run(); } //Functioning Main progam public void run() { //Variables Local to run() String control= new String("1"); String searchword= new String(); //String searchreturn= new String(); WordData2 searchreturn= new WordData2("empty", "empty"); String input_file=new String(); try { //Create the Console system InputStreamReader reader = new InputStreamReader(System.in); BufferedReader console = new BufferedReader(reader); //Load the Seedfile System.out.println("Welcome to the crawler"); System.out.println("Enter SeedFile"); input_file = console.readLine(); fileQ.enqueue(input_file); hashtable.put(input_file,input_file); //debatable putting this here. The problem if it isn't here is that //any references to the seedfile will never be added since they are //in the hashtable addtree(wordtree, input_file.toLowerCase(),input_file.toLowerCase()); addtree(txttree, input_file.toLowerCase(),input_file.toLowerCase()); //Crawl while(!fileQ.isEmpty()) { loadtree((String) fileQ.dequeue()); } //Let User search for words while(!control.equals("9")) { System.out.println("Enter 1 to search for a word, or 9 to exit"); control = console.readLine(); if(control.equals("1")) { System.out.println(" Enter search word"); System.out.print(" "); searchword = console.readLine(); searchreturn=(WordData2) wordtree.get(searchword); if(searchreturn != null) { System.out.println(" "+searchreturn.word); searchreturn.Swrite_occurance(); } else { System.out.println(" The word was not found"); } } System.out.println(); } System.out.println("The search trees are being printed to file."); System.out.println("Goodbye."); //Write Out the Trees to Text Files write_to_file(wordtree,"data/vocabulary.txt"); write_to_file(emailtree,"data/emails.txt"); write_to_file(txttree,"data/textfiles.txt"); } catch(IOException e) { System.out.println(e); System.exit(1); } }//END run public void write_to_file(TreeMap t, String file) { try { //Print out the contents of the tree //FileWriter w = new FileWriter(file); //NOTE: I use the Buffered reader so I can get newLine() because +"\n" //Was printing a box and not moving to a newline BufferedWriter w = new BufferedWriter(new FileWriter(file)); Iterator it = t.values().iterator(); while (it.hasNext()) { WordData2 data = (WordData2) it.next(); w.write(data.word); w.newLine(); data.Fwrite_occurance(w); w.newLine(); w.newLine(); } w.close(); } catch(IOException e) { System.out.println(e); System.exit(1); } }//END write_to_file public void loadtree(String file) { try { //Open input_file BufferedReader r = new BufferedReader(new FileReader(file)); //Create a Tokenizer on the file StreamTokenizer st = new StreamTokenizer(r); //Specifies that @ and / can be part of a word so that we can get emails //and subdirectories st.wordChars((int)'@',(int)'@'); st.wordChars((int)'/',(int)'/'); //Load the structures while (st.nextToken() != st.TT_EOF) { if (st.ttype == st.TT_WORD) { //Convert word to all lowercase st.sval=st.sval.toLowerCase(); //Add to wordtree, //NOTE here we get rid or periods attached to words at end of scentences if(st.sval.endsWith(".")==true) addtree(wordtree, st.sval.substring(0,st.sval.length()-1), file); else addtree(wordtree, st.sval, file); //If value is a textfile add to txttree and Hashtable + FileQ if not present if(st.sval.endsWith(".txt")==true) { //if textfile isn't in hashtable add it to hashtable and queue if(hashtable.containsValue(st.sval)==false) { hashtable.put(st.sval,st.sval); fileQ.enqueue(st.sval); } addtree(txttree, st.sval, file); } //If value contains @ and . after 1st character, add to emailtree if(st.sval.indexOf((int) '@')>0 && st.sval.indexOf((int) '.')>0) { addtree(emailtree, st.sval, file); } } } } //I comment out errors because we will certainly try to open textfiles //which do not exsist + we don't want that to stop the crawl catch (IOException e) { //System.err.println(e); //System.exit(1); } }//END loadtree void addtree(TreeMap t, String input, String inputfile) { WordData2 data = (WordData2) t.get(input); if (data == null) { t.put(input, new WordData2(input, inputfile) ); } else { data.increment(inputfile); } }//END loadtree }//End System6