//jleblanc 5.3.06 //PAC: C crawler ///////////////////////////////////////////////////////////////// //DESCRIPTION //From console line the program takes in a seedfile //It then allows the user to search for words in the crawled files //Provides file occurance information for words //Outputs search trees to: vocabulary.txt, emails.txt, and textfiles.txt (textfiles is actually a hashtable) /////////////////////////////////////////////////////////////////// //NOTES //Has rudimentry removal of non letter and number characters from the end of words //Doesn't remove non letter or Number characters from front or interior of words //Will choke and crash if any crawled files have words with more than 200 characters #include #include #include #define num_bins 50 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //STRUCTURES ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //boolean type typedef enum {FALSE, TRUE} boolean; //Queue Node struct qnode { int count; char *w; struct qnode *next; }; typedef struct qnode qnode; //Queue struct queue { int count; qnode *top; qnode *rear; }; typedef struct queue queue; //List Queue struct lnode { int count; char *w; struct lnode *next; }; typedef struct lnode lnode; //Tree Node struct tnode { char *w; lnode *flist; struct tnode *left; struct tnode *right; }; typedef struct tnode tnode; //Tree struct tree { tnode *top; }; typedef struct tree tree; //Hash Table struct hashtable { int number_bins; struct tree hash[num_bins]; }; typedef struct hashtable hashtable; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //FUNCTION DECLARATIONS /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //tree functions boolean tempty(tree t); boolean tfull (tree t); void tinit(tree *t); tnode* tadd(tree *t, char word[], char *filename); tnode* tnadd(tnode *n, char word[], char *filename); void create_tnode(tnode *p, char word[], char *filename); void addtlist(tnode *tn, char *filename); void tprint(FILE *fp,tree *t); void tnprint(FILE *fp, tnode *tn); tnode* tsearch(char word[],tree *t); tnode* tnsearch(char word[], tnode *tn); void sortflist(tnode *tn); void findanddisplay(char word[], tree *t); void tnprintscrn(tnode *tn); void tprint_flist_scrn(tnode *tn); //hashtable functions int hashfunction(char input[]); void init_hash(hashtable *h); void print_hash(char file[], hashtable *h); tnode* find_hash(char target[], hashtable *h); //Sorting Functions int word_type(char word[]); int is_email(char word[]); int is_htm(char word[]); int is_txt(char word[]); void distribute(char *filename, FILE *fp, hashtable *h, tree *t_w, tree *t_e, tree *t_h, tree *t_txts, queue *q); //Queue Functions boolean qempty(const queue q); boolean qfull (const queue q); void qinit ( queue *q ); void qadd(char *c, queue *q ); char* qremove(queue *q ); void qprint(FILE *fp, queue *q); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //MAIN FILE ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void main() { //Variables and structures FILE *fp=NULL; char *holdfile; int hashindex; tnode *filep; hashtable txts; tree htmls; tree words; tree emails; tree texts; queue filequeue; char searchword[200]; int control=1; //Names of the source and output files char seedfile[] = "datain.txt"; //Note that user is prompted to enter this char output_txt[] = "textfiles.txt"; char output_html[] = "htmls.txt"; char output_words[] = "vocabulary.txt"; char output_emails[] = "emails.txt"; char file_queue[] = "filequeue.txt"; //Initialize the Hash Table and Trees init_hash(&txts); tinit(&htmls); tinit(&words); tinit(&emails); tinit(&texts); qinit(&filequeue); //load root file into queue and hashtable printf("Welcome to the C File Crawler\n\n"); while(fp==NULL) { printf("Please enter a seedfile: "); fflush(stdin); //This prevents a "bounce" in the command line scanf ("%s", &seedfile); fp = fopen(seedfile, "r"); if(fp==NULL) { printf("\nThe seedfile you entered could not be found\n"); } else { fclose(fp); } } printf("\n\nProcessing Files.....\n\n"); hashindex=hashfunction(seedfile); //Get hashindex of seedfile filep=tadd(&(txts.hash[hashindex]), seedfile, NULL); //Add seedfile to hashtable tree qadd(filep->w, &filequeue ); //Add pointer to filename to the queue //Build index structure while(! qempty(filequeue)) { holdfile=qremove(&filequeue); fp = fopen(holdfile, "r"); if(fp != NULL) { distribute(holdfile, fp, &txts, &words, &emails, &htmls, &texts, &filequeue); fclose(fp); } } //Enable searching the structure while(control==1) { printf("enter 1 to search for a word or 9 to exit:"); fflush(stdin); //This prevents a "bounce" in the command line scanf("%d", &control); if(control==1) { printf("enter search >> "); fflush(stdin); //This prevents a "bounce" in the command line scanf ("%s", &searchword); printf("\n\n"); //search and display to screen findanddisplay(searchword, &words); printf("\n"); //Offset next line } } //Print out the contents of the Hash Table and Trees to file print_hash(output_txt,&txts); fp = fopen(output_html, "w"); tprint(fp, &htmls); fclose(fp); fp = fopen(output_words, "w"); tprint(fp, &words); fclose(fp); fp = fopen(output_emails, "w"); tprint(fp, &emails); fclose(fp); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //FUNCTIONS ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //SORTING FUNCTIONS////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void distribute(char *filename, FILE *fp, hashtable *h, tree *t_w, tree *t_e, tree *t_h, tree *t_txts, queue *q) { char hold[200]; int hashindex; int kind; int L; tnode *narc; while ( !feof( fp ) ) { if(fscanf (fp,"%s", &hold) != EOF) { //This set of commands eliminates non letter or numbers from terminating a word by replacing these with the null character //effectively truncating the word L=strlen(hold); L=L-1; while( L>0 && !( 'A' <= hold[L] && 'Z' >= hold[L]) && !( 'a' <= hold[L] && 'z' >= hold[L])&& !( '0' <= hold[L] && '9' >= hold[L]) ) { hold[L]=0; L=L-1; } kind=word_type(hold); //if(kind==0)//word //{ narc=tadd(t_w, hold, filename); //I have decided to add all words to the word tree //} if(kind==1)//email { narc=tadd(t_e, hold, filename); } else if(kind==2)//htm/html { narc=tadd(t_h, hold, filename); } else if(kind==3)//txt { //Add to tree narc=tadd(t_txts, hold, filename); //Add into Crawl Control hashindex=hashfunction(hold); //calculate hashindex if(find_hash(hold, h)==NULL && (hashindex>-1 && hashindex < h->number_bins)) { narc=tadd(&h->hash[hashindex], hold, filename); //Add to hashtable and queue qadd(narc->w, q); //HERE the Q should get pointer to narc... } } } } }//END distribute int word_type(char word[]) { //0 word, 1 email, 2 htm/html, 3 txt int type=0; if( is_email(word) == 1) { type=1;} else if( is_htm(word) == 1) { type=2;} else if( is_txt(word) ==1) { type=3;} return type; }//END word_type int is_email(char word[]) { int is_it=-1; char *a, *d; a=strchr(word, '@'); if(a!=NULL) {d=strchr(a, '.');} if(a!=NULL && d!=NULL) { is_it=1; } return is_it; }//END is_email //This picks up html pages too int is_htm(char word[]) { int is_it = -1; char *h; h = strstr(word, ".htm"); if (h != NULL) { is_it = 1; } return is_it; }//END is_htm int is_txt(char word[]) { int is_it=-1; char *h; h=strstr(word, ".txt"); if(h!=NULL) { is_it=1; } return is_it; }//END is_txt ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //QUEUE FUNCTIONS ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// boolean qempty(const queue q) { return ((boolean) (q.top == NULL)); } boolean qfull (const queue q) { return FALSE; } void qinit ( queue *q ) { q -> count = 0; q -> top = NULL; q->rear = NULL; } void qadd(char *c, queue *q ) { qnode *p; if (! qfull(*q)) { p = malloc(sizeof(qnode)); //Request the space, put address in p p->w=c; //Set W to input pointer p->next=NULL; //Set new node's pointer to NULL since at the end if(q->rear != NULL) { q->rear->next=p; //set the current rear node's next to the new node (only if current rear exsists) } else { q->top=p; // if rear is NULL then there are no nodes, so top must be set to the new node } q->rear=p; //set the queue's rear to the new rear q -> count ++; //increment the 'count' field } }//END qadd char* qremove(queue *q ) { qnode *p; char *hold; hold=NULL; if (! qempty(*q)) { hold=q->top->w; //Pass the pointer value to hold p=q->top; //p points to first node q->top=p->next; //top points to the next node free(p); //delete the node q->count--; //decrement the count field if(q->top==NULL) { q->rear=NULL; //if top points to NULL then there are no nodes so rear points to NULL also } } return hold; }//END qremove void qprint(FILE *fp, queue *q) //NOTE this is a destructive print { char *hold; while(! qempty(*q)) { hold=qremove(q); fprintf(fp,"\t%s\n", hold); } }//END qprint ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///TREE FUNCTIONS ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// boolean tempty(tree t) { return ((boolean) (t.top == NULL)); } boolean tfull (tree t) { return FALSE; } void tinit(tree *t) { t->top=NULL; } //returns address of newest tnode if added, or NULL if nothing added tnode* tadd(tree *t, char word[], char *filename) { tnode *filepointer=NULL; if(tempty(*t) == TRUE) { t->top = malloc(sizeof(tnode)); //Allocate tnode space create_tnode(t->top, word, filename); //fill tnode space filepointer=t->top; //Set filepointer to new tnode } else { filepointer=tnadd(t->top, word, filename);} return filepointer; //NULL }//END tadd tnode* tnadd(tnode *n, char word[], char *filename) { int direction; tnode *filepointer=NULL; direction=strcmp(word, n->w); //for strcmp(A,B), 0:A=B, -1:AB if(direction==0) { addtlist(n, filename); //Update flist } else { if( direction < 0) { if(n->left == NULL) { n->left = malloc(sizeof(tnode)); //Allocate tnode space create_tnode(n->left, word, filename); //fill tnode space filepointer=n->left; } else {filepointer=tnadd(n->left, word, filename);} } else { if(n->right == NULL) { n->right = malloc(sizeof(tnode)); //Allocate tnode space create_tnode(n->right , word, filename); //fill tnode space filepointer=n->right; } else {filepointer=tnadd(n->right, word, filename);} } } return filepointer; }//END tnadd void create_tnode(tnode *p, char word[], char *filename) { p->w = malloc (strlen(word)); //Allocate word space strcpy(p ->w, word); //Copy word into word space p->right=p ->left=NULL; //Set children to NULL p ->flist = NULL; //Set flist pointer to NULL addtlist(p, filename); //Update flist }//END create_tnode void addtlist(tnode *tn, char *filename) { lnode *hold; int create=1; //If 1 it means insert new node, if 0 don't insert if(filename != NULL) { if(tn->flist != NULL) { if( strcmp(filename, tn->flist->w) ==0) { tn->flist->count++; create=0; } //If filenames equal increment } if(create==1) { hold=tn->flist; //Hold list pointer tn->flist=malloc(sizeof(lnode)); //Allocate space for node tn->flist->count=1; //Set count to 1 tn->flist->w=filename; //Point w to the filename location tn->flist->next=hold; //Set new node next to old t->list } } }//END addtlist void tprint_flist(FILE *fp, tnode *tn) //will we need a file pointer? { lnode *p; sortflist(tn); if(tn->flist != NULL) { p=tn->flist; while(p != NULL) { fprintf(fp, "\t\t %d :: %s\n", p->count, p->w); p=p->next; } } else { fprintf(fp, "\t\t No filelist was created\n"); } } void tprint_flist_scrn(tnode *tn) //will we need a file pointer? { lnode *p; sortflist(tn); if(tn->flist != NULL) { p=tn->flist; while(p != NULL) { printf("\t\t %d :: %s\n", p->count, p->w); p=p->next; } } else { printf("\t\t No filelist was created\n"); } } void sortflist(tnode *tn) { int num=0; lnode *p=tn->flist; lnode *f=tn->flist; lnode *b=tn->flist; char *wordhold; int counthold; while(p != NULL) { num++; p=p->next; } for(int i=0; inext!=NULL) { f=b->next; if(b->count < f->count) { wordhold=b->w; counthold=b->count; b->count=f->count; b->w=f->w; f->count=counthold; f->w=wordhold; } b=b->next; } } } void tprint(FILE *fp,tree *t) { if(tempty(*t)==TRUE) { fprintf(fp, "tree is empty\n");} else { tnprint(fp, t->top);} } void tnprint(FILE *fp, tnode *tn) { //fprintf(fp,"\t%s \n", tn->w); //tprint_flist(fp, tn); if(tn->left != NULL) { tnprint(fp, tn->left );} fprintf(fp,"\t%s \n", tn->w); tprint_flist(fp, tn); if(tn->right !=NULL) {tnprint(fp, tn->right );} } void findanddisplay(char word[], tree *t) { tnode *result; result=tsearch(word, t); if(result !=NULL) { tnprintscrn(result); } else { printf("\t\t no record of this word\n"); } } void tnprintscrn(tnode *tn) { printf("\t%s \n", tn->w); tprint_flist_scrn(tn); } //Return a pointer to the node if found tnode* tsearch(char word[],tree *t) { tnode *result; if(tempty(*t)==TRUE) { result=NULL;} else { result=tnsearch(word, t->top); } return result; } tnode* tnsearch(char word[], tnode *tn) { int direction; tnode *result=NULL; direction=strcmp(word, tn->w); if(direction==0) {result=tn;}//Means found else if(direction<0) { if( tn->left != NULL) { result=tnsearch(word, tn->left );} else {result=NULL;} } else if(direction>0) { if( tn->right != NULL) { result=tnsearch(word, tn->right );} else {result=NULL;} } return result; }//end find_node ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //Hash Functions ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int hashfunction(char input[]) { return (int) strlen(input); } void init_hash(hashtable *h) { h->number_bins=num_bins; for(int i=0; inumber_bins; i++) { tinit(&h->hash[i]);} } void print_hash(char file[], hashtable *h) { FILE *fp; fp = fopen(file, "w"); fprintf(fp,"The Start\n"); for(int i=0; i < h->number_bins; i++) { if(!tempty(h->hash[i])) { fprintf(fp,"\nIn hash bin %d:\n", i); tprint(fp, &h->hash[i]); } } fprintf(fp,"\n The End\n\n"); fclose(fp); } tnode* find_hash(char target[], hashtable *h) { int index; tnode *result=NULL; index=hashfunction(target); if(index>-1 && index < h->number_bins) {result=tsearch(target, &h->hash[index]);} return result; }