Created
March 28, 2014 18:58
-
-
Save anonymous/9840380 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Lab1;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
public class Lab1 { | |
public static int searchStopword(String[] stopList, String key) | |
{ | |
int lowest_pos = 0; // Position of the lowest or first element in the array. | |
int highest_pos = stopList.length-1; // Position of the highest or last element in the array. | |
while(lowest_pos <= highest_pos) | |
{ | |
// key must be an element in the range of positions lowest_pos to highest_pos in the array, or it is not present. | |
int middle_pos = lowest_pos + (highest_pos - lowest_pos)/2; | |
int result = key.compareTo(stopList[middle_pos]); | |
if(result < 0){ | |
highest_pos = middle_pos - 1; | |
}else{ | |
if(result > 0){ | |
lowest_pos = middle_pos + 1; | |
// return lowest_pos; | |
}else{ // The word evaluated was found in the stopList, which means that it is a stop word. | |
return middle_pos; // Returning any number that is not -1 will work because when this function is called, the comparison will be made with the value -1. | |
} | |
} | |
} | |
return -1; // The word evaluated was not found in the stopList, which means that it is NOT a stop word. | |
} | |
public static void main(String[] args){ | |
String[] stopList = {"a","is","in","so","of","at","the","to","and","it","as","be","are"}; | |
File folder = new File("/Users/jaimemontoya/Desktop/Lab1_Data"); // Folder that contains the .txt files. | |
File[] listOfFiles = folder.listFiles(); | |
String[] docs = new String[folder.listFiles().length]; // Variable that will store all of the documents. "folder.listFiles().length" is used to specify the number of elements that the array will contain, which will be the number of files contained in the folder, since that is what "folder.listFiles().length" returns as an integer. | |
int i = 0; // Variable to record the iterations. | |
ArrayList<String> termList; | |
termList = new ArrayList<String>(); | |
ArrayList<String> docLists; | |
docLists = new ArrayList<String>(); | |
for (File file : listOfFiles) { // If the folder has 5 files, the loop will make 5 iterations. | |
if (file.isFile()) { | |
FileInputStream filecontent = null; | |
try { | |
filecontent = new FileInputStream(file); // Obtain input bytes from the current .txt file. | |
int content; | |
while ((content = filecontent.read()) != -1) { // Read bytes of data from the current .txt file input stream until the end of the file is reached, which is when read() returns -1. | |
// convert to char and display it | |
// System.out.print((char) content); | |
docs[i] += (char) content; // 1. Assign a unique id to each text document, i.e., 1-5. DONE! docs[0] will contain the first document, docs[1] will be the second document,... | |
} | |
String[] tokens = docs[i].split("[ .&%$#!/+-:?\"();]"); // 2. Read in the text in each document and perform tokenization. Treat punctuation (e.g., “. & % $ # ! /”), symbols (e.g., “+-*/”), and spaces as delimiters. DONE! | |
Arrays.sort(stopList); | |
for(String token:tokens){ // token stores the word evaluated on each iteration. | |
if(searchStopword(stopList, token)==-1){ // 3. Adopt a proper data structure to store the given stop word list and use it to efficiently remove all the stop words in the documents. DONE! | |
//System.out.println("Current token: "+token); | |
if(!termList.contains(token)){ // The token/term was not found in the dictionary. | |
termList.add(token); // Add token/term to the dictionary. | |
docLists.add(String.valueOf(i)); // Add posting for the token/term that was just added in the previous line of code. | |
}else{ // The token/term already appears in the dictionary, update the posting for the token/term. | |
// System.out.println("The term/token ||| "+token+" ||| was found again on document "+i+"!"); | |
int index = termList.indexOf(token); | |
String newvalue = docLists.get(index) + i; | |
newvalue = newvalue.replaceAll("(.)\\1{1,}", "$1"); // Remove duplicates. For example, if a term/token appears 4 times in document 5, 2 times in document 6, and 1 time in document 7, the original value of the variable would be 5555667. This line of code will transform 5555667 into 567, which means that the term/token appeared in the documents 5, 6 and 7. Patter explanation for the regular expression: "(.)\\1{1,}" means any character (added to group 1) followed by itself at least once. "$1" references contents of group 1. Source: http://stackoverflow.com/questions/19730522/remove-repeated-characters-in-a-string | |
docLists.set(index, newvalue); | |
// System.out.println("The word "+token+" has this index: "+index); | |
} | |
} | |
} | |
//System.out.print("-----------------------------------END OF CURRENT DOCUMENT "+i+"-------------------------------------------------------------"); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} finally { | |
try { | |
if (filecontent != null) | |
filecontent.close(); | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
} | |
} | |
i += 1; // Counter of iterations. | |
} | |
int counter = 0; | |
for(String term:termList){ | |
System.out.println("THE TERM " + term + " APPEARS IN THE DOCUMENT(S) NUMBER: " + docLists.get(counter)); // 5. All the remaining tokens will be treated as terms in the dictionary. Documents that contain the term should appear in the postings list of the term. DONE! | |
counter += 1; | |
} | |
Stemmer st = new Stemmer(); | |
st.add("thinker".toCharArray(),"11111".length()); // The result will have a length of 5 characters: think | |
st.stem(); // 4. Call Porter’s stemmer to perform stemming. DONE! | |
System.out.println("Stemmed: " + st.toString()); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment