//PAC: system takes in source from a datafile + splits it into words, textfiles, emails, and html //jleblanc:3.9.06 //Requires a datain.txt file with words //These examples are a bit fragile (e.g. words can be a max of only 50 characters and //there is no error checking for them if overflow), but work within these current confines. //The hash function is a simple string length function //The word kind checking functions are also simple so red.txt.kljdfl could be interpretted //as a text file /* Extend the hash table assignment ... such that, as words are read, each word is examined and then stored in an appropriate data structure. The word filtering rules are as follows: * words that might be e-mail addresses should be stored in a tree structure "emails" ... and when the program is finished, written to a file "emails.txt"; * words that might be html documents should be stored in a separate tree variable "htmls" ... and later written to a file "htmls.txt"; * words that might be text files (as denoted by the file extension ".txt") should be stored non-redundantly in the hash table (the one that you have used before)... the contents of which should later be written to a file "textfiles.txt"; * all other words should go into a tree variable "wordtree" and later be written to a file "words.txt". Thus, there are four main variables used to store the words ... three trees, and an array of trees (the hash table) ... but the code to manage each is the same, that is, the same add function, the same traverse function. The library "strings.h" should be a big help in writing the functions that filter the words. I'll send separately a diagram that helps put this in a graphical perspective. As noted, individual pieces of our current work will be examined in review sessions within the Sakai system. We'll conduct the first session, to review the features of this assignment, on Sunday afternoon at 4:00 pm. Note that this program is an ongoing exercise, one that we will extend several times in the coming weeks, until it is fully functional. Separately, take some time to install and peruse the assembly langugae environment ... test it with the supplied "hello world" example ... if you are successful with this by Thursday, then we're on schedule for the upcoming work. */ #include #include #include #define num_bins 50 //STRUCTURES//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //boolean type typedef enum {FALSE, TRUE} boolean; //tree node struct tnode { char word[50]; int count; struct tnode *child[2]; }; typedef struct tnode tnode; //tree struct tree { tnode *top; }; typedef struct tree tree; //hash table struct hashtable { int number_bins; struct tree hash[num_bins]; }; typedef struct hashtable hashtable; //FUNCTION DECLARATIONS////////////////////////////////////////////////////////////////////////////////////////////////////////////// //tree functions boolean tempty(tree t); boolean tfull (tree t); void init_tree(tree *t); void add_to_tree(tree *t, char item[]); void add_a_node(tnode *n, char item[]); void print_tree(FILE *fp,tree *t); void print_node(FILE *fp, tnode *n); //hashtable functions int hashfunction(char input[]); void init_hash(hashtable *h); void load_hash(char file[], hashtable *h); void print_hash(char file[], hashtable *h); //Sorting Functions int word_type(char word[]); int is_email(char word[]); int is_htm(char word[]); int is_txt(char word[]); void distribute(char file[], hashtable *h, tree *t_w, tree *t_e, tree *t_h); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //MAIN FILE ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void main() { //Variables and structures FILE *fp; hashtable txts; tree htmls; tree words; tree emails; //Names of the source and output files char source_file[] = "datain.txt"; char output_txt[] = "textfiles.txt"; char output_html[] = "htmls.txt"; char output_words[] = "words.txt"; char output_emails[] = "emails.txt"; //Initialize the Hash Table and Trees init_hash(&txts); init_tree(&htmls); init_tree(&words); init_tree(&emails); //Here we distribute the contents of the source file to the various structures distribute(source_file, &txts, &words, &emails, &htmls); //Print out the contents of the Hash Table and Trees print_hash(output_txt,&txts); fp = fopen(output_html, "w"); print_tree(fp, &htmls); fclose(fp); fp = fopen(output_words, "w"); print_tree(fp, &words); fclose(fp); fp = fopen(output_emails, "w"); print_tree(fp, &emails); fclose(fp); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //FUNCTIONS ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// boolean tempty(tree t) { return ((boolean) (t.top == NULL)); } boolean tfull (tree t) { return FALSE; } void init_tree(tree *t) { t->top=NULL; } //Add elements to the tree///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void add_to_tree(tree *t, char item[]) { if(tempty(*t)==TRUE) { t->top=malloc(sizeof(tnode)); strcpy( t->top ->word, item); //t->top->count=1; (*(*t).top).count=1; t->top->child[0] =NULL; t->top->child[1] =NULL; } else { add_a_node(t->top, item);} } void add_a_node(tnode *n, char item[]) { int direction; direction=strcmp(item, n->word); if(direction==0) { n->count++;} else { if( direction<0) //incoming less, look/add left { direction=0;} else {direction=1;} if(n->child[direction] ==NULL) { n->child[direction] =malloc(sizeof(tnode)); strcpy( n->child[direction] ->word, item); n->child[direction] ->count=1; n->child[direction] ->child[0]=NULL; n->child[direction] ->child[1]=NULL; } else//Recursive call { add_a_node(n->child[direction], item); } } } //Print the tree to a file/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void print_tree(FILE *fp,tree *t) { if(tempty(*t)==TRUE) { fprintf(fp, "tree is empty\n");} else { print_node(fp, t->top);} } void print_node(FILE *fp, tnode *n) { fprintf(fp,"\t%s occuring %d times\n", n->word, n->count); if(n->child[0] != NULL) { print_node(fp, n->child[0] );} if(n->child[1] !=NULL) {print_node(fp, n->child[1] );} } //Hash Function//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int hashfunction(char input[]) { return (int) strlen(input); } //Functions Running the Hash Table//////////////////////////////////////////////////////////////////////////////////////////////////// void init_hash(hashtable *h) { h->number_bins=num_bins; for(int i=0; inumber_bins; i++) { init_tree(&h->hash[i]);} } void load_hash(char file[], hashtable *h) { char hold[50]; int index; FILE *fp; fp = fopen(file, "r"); while ( !feof( fp ) ) { if(fscanf (fp,"%s", &hold) != EOF) { index=hashfunction(hold); if(index>-1 && index < h->number_bins) {add_to_tree(&h->hash[index], hold);} } } fclose(fp); } void print_hash(char file[], hashtable *h) { FILE *fp; fp = fopen(file, "w"); fprintf(fp,"The Start\n"); for(int i=0; i < h->number_bins; i++) { if(!tempty(h->hash[i])) { fprintf(fp,"\nIn hash bin %d:\n", i); print_tree(fp, &h->hash[i]); } } fprintf(fp,"\n The End\n\n"); fclose(fp); } //SORTING FUNCTIONS/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void distribute(char file[], hashtable *h, tree *t_w, tree *t_e, tree *t_h) { char hold[50]; int hashindex; int kind; FILE *fp; fp = fopen(file, "r"); while ( !feof( fp ) ) { if(fscanf (fp,"%s", &hold) != EOF) { //First determine which bin it goes to kind=word_type(hold); if(kind==0)//word { add_to_tree(t_w, hold); } else if(kind==1)//email { add_to_tree(t_e, hold); } else if(kind==2)//htm/html { add_to_tree(t_h, hold); } else if(kind==3)//txt { hashindex=hashfunction(hold); if(hashindex>-1 && hashindex < h->number_bins) {add_to_tree(&h->hash[hashindex], hold);} } } } fclose(fp); }//END distribute int word_type(char word[]) { //0 word, 1 email, 2 htm/html, 3 txt int type=0; if( is_email(word) == 1) { type=1;} else if( is_htm(word) == 1) { type=2;} else if( is_txt(word) ==1) { type=3;} return type; } int is_email(char word[]) { int is_it=-1; char *a, *d; a=strchr(word, '@'); if(a!=NULL) {d=strchr(a, '.');} if(a!=NULL && d!=NULL) { is_it=1; } return is_it; } //This picks up html pages too int is_htm(char word[]) { int is_it=-1; char *h; h=strstr(word, ".htm"); if(h!=NULL) { is_it=1; } return is_it; } int is_txt(char word[]) { int is_it=-1; char *h; h=strstr(word, ".txt"); if(h!=NULL) { is_it=1; } return is_it; }