Skip to content

Instantly share code, notes, and snippets.

@akesling
Forked from cdfox/preprocess.go
Last active December 15, 2015 21:49

Revisions

  1. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess.go
    Original file line number Diff line number Diff line change
    @@ -17,7 +17,7 @@ import (
    func main() {
    if len(os.Args) < 4 {
    fmt.Printf("Too few arguments. Usage: %s inputFile stopwordFile outputFile \n", os.Args[0])
    os.Exit(2)
    os.Exit(1)
    }

    inFile, err := os.Open(os.Args[1])
  2. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess.go
    Original file line number Diff line number Diff line change
    @@ -17,7 +17,7 @@ import (
    func main() {
    if len(os.Args) < 4 {
    fmt.Printf("Too few arguments. Usage: %s inputFile stopwordFile outputFile \n", os.Args[0])
    return
    os.Exit(2)
    }

    inFile, err := os.Open(os.Args[1])
  3. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess.go
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,7 @@ import (

    func main() {
    if len(os.Args) < 4 {
    fmt.Println("Too few arguments. Usage: preprocess IN_FILE STOPWORD_FILE OUT_FILE")
    fmt.Printf("Too few arguments. Usage: %s inputFile stopwordFile outputFile \n", os.Args[0])
    return
    }

  4. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -61,7 +61,7 @@ func main() {
    for {
    line, r_err := reader.ReadString('\n')
    if r_err != nil && r_err != io.EOF {
    panic(err)
    panic(r_err)
    }

    nonalphanumeric, err := regexp.Compile(`\W`)
    @@ -83,7 +83,6 @@ func main() {
    if len(filtered) > 0 {
    csv := strings.Join(filtered, ",")

    // write a line
    if _, err := writer.WriteString(csv + "\n"); err != nil {
    panic(err)
    }
  5. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -43,6 +43,7 @@ func main() {

    word := strings.TrimSpace(line)
    stopwords[word] = true

    if err == io.EOF {
    break
    }
  6. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -40,6 +40,7 @@ func main() {
    if err != nil && err != io.EOF {
    panic(err)
    }

    word := strings.TrimSpace(line)
    stopwords[word] = true
    if err == io.EOF {
  7. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 1 addition and 3 deletions.
    4 changes: 1 addition & 3 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -25,16 +25,15 @@ func main() {
    panic(err)
    }
    defer inFile.Close()

    reader := bufio.NewReader(infile)

    stopwordFile, err := os.Open(os.Args[2])
    if err != nil {
    panic(err)
    }
    defer stopWordFile.Close()

    stopwordReader := bufio.NewReader(stopwordfile)

    stopwords := make(map[string]bool)
    for {
    line, err := stopwordReader.ReadString('\n')
    @@ -53,7 +52,6 @@ func main() {
    panic(err)
    }
    defer outFile.Close()

    writer := bufio.NewWriter(outfile)

    // remove nonalphanumeric characters, lowercase,
  8. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 8 additions and 8 deletions.
    16 changes: 8 additions & 8 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -20,24 +20,24 @@ func main() {
    return
    }

    infile, err := os.Open(os.Args[1])
    inFile, err := os.Open(os.Args[1])
    if err != nil {
    panic(err)
    }
    defer infile.Close()
    defer inFile.Close()

    reader := bufio.NewReader(infile)

    stopwordfile, err := os.Open(os.Args[2])
    stopwordFile, err := os.Open(os.Args[2])
    if err != nil {
    panic(err)
    }
    defer stopwordfile.Close()
    defer stopWordFile.Close()

    stopwordreader := bufio.NewReader(stopwordfile)
    stopwordReader := bufio.NewReader(stopwordfile)
    stopwords := make(map[string]bool)
    for {
    line, err := stopwordreader.ReadString('\n')
    line, err := stopwordReader.ReadString('\n')
    if err != nil && err != io.EOF {
    panic(err)
    }
    @@ -48,11 +48,11 @@ func main() {
    }
    }

    outfile, err := os.Create(os.Args[3])
    outFile, err := os.Create(os.Args[3])
    if err != nil {
    panic(err)
    }
    defer outfile.Close()
    defer outFile.Close()

    writer := bufio.NewWriter(outfile)

  9. Alex Kesling revised this gist Apr 6, 2013. 1 changed file with 6 additions and 17 deletions.
    23 changes: 6 additions & 17 deletions preprocess.go
    Original file line number Diff line number Diff line change
    @@ -20,25 +20,20 @@ func main() {
    return
    }

    // open input file
    infile, err := os.Open(os.Args[1])
    if err != nil {
    panic(err)
    }
    // close infile on exit and check for its returned error
    defer func() {
    if infile.Close() != nil {
    panic(err)
    }
    }()
    // make a read buffer
    defer infile.Close()

    reader := bufio.NewReader(infile)

    // build stopword set
    stopwordfile, err := os.Open(os.Args[2])
    if err != nil {
    panic(err)
    }
    defer stopwordfile.Close()

    stopwordreader := bufio.NewReader(stopwordfile)
    stopwords := make(map[string]bool)
    for {
    @@ -53,18 +48,12 @@ func main() {
    }
    }

    // open output file
    outfile, err := os.Create(os.Args[3])
    if err != nil {
    panic(err)
    }
    // close outfile on exit and check for its returned error
    defer func() {
    if outfile.Close() != nil {
    panic(err)
    }
    }()
    // make a write buffer
    defer outfile.Close()

    writer := bufio.NewWriter(outfile)

    // remove nonalphanumeric characters, lowercase,
  10. @cdfox cdfox revised this gist Apr 6, 2013. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion preprocess.go
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,7 @@
    // For each line of the input file, remove nonalphanumeric characters,
    // lowercase all letters, remove stopwords, and write the result to the output
    // file.
    // file. I used the answer here as a template for reading/writing files:
    // http://stackoverflow.com/questions/1821811/how-to-read-write-from-to-file/9739903#9739903

    package main

  11. @cdfox cdfox revised this gist Apr 6, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess.go
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    package main

    import (
    "bufio"
    "bufio"
    "fmt"
    "io"
    "os"
  12. @cdfox cdfox renamed this gist Apr 6, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  13. @cdfox cdfox created this gist Apr 6, 2013.
    110 changes: 110 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,110 @@
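    // For each line of the input file, replace every non-word character (anything
    // outside [0-9A-Za-z_]) with a space, lowercase all letters, remove stopwords,
    // and write the remaining tokens, comma-separated, as one line of the output
    // file. Lines with no remaining tokens are skipped.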

    package main

    import (
    "bufio"
    "fmt"
    "io"
    "os"
    "regexp"
    "strings"
    )

    // main reads the input file named by os.Args[1] line by line, replaces every
    // non-word character (regexp `\W`) with a space, lowercases the line, drops
    // the tokens listed in the stopword file (os.Args[2], one word per line), and
    // writes the surviving tokens, comma-separated, as one line of the output
    // file (os.Args[3]). Lines with no surviving tokens produce no output line.
    //
    // With fewer than three arguments it prints a usage message and returns.
    // Any I/O failure panics with the error that caused it.
    func main() {
        if len(os.Args) < 4 {
            fmt.Println("Too few arguments. Usage: preprocess IN_FILE STOPWORD_FILE OUT_FILE")
            return
        }

        // Open the input file. It is only read, so a close error carries no
        // information worth reporting.
        infile, err := os.Open(os.Args[1])
        if err != nil {
            panic(err)
        }
        defer infile.Close()
        reader := bufio.NewReader(infile)

        // Build the stopword set.
        stopwordfile, err := os.Open(os.Args[2])
        if err != nil {
            panic(err)
        }
        defer stopwordfile.Close()
        stopwordReader := bufio.NewReader(stopwordfile)
        stopwords := make(map[string]bool)
        for {
            line, readErr := stopwordReader.ReadString('\n')
            if readErr != nil && readErr != io.EOF {
                panic(readErr)
            }
            // Blank lines (and the empty read at EOF) are not stopwords.
            if word := strings.TrimSpace(line); word != "" {
                stopwords[word] = true
            }
            if readErr == io.EOF {
                break
            }
        }

        // Open the output file. Report the error Close itself returns: for a
        // written file that is where a failed final write can surface.
        outfile, err := os.Create(os.Args[3])
        if err != nil {
            panic(err)
        }
        defer func() {
            if closeErr := outfile.Close(); closeErr != nil {
                panic(closeErr)
            }
        }()
        writer := bufio.NewWriter(outfile)

        // Compile the pattern once; it does not change between lines.
        nonWord := regexp.MustCompile(`\W`)

        // Replace non-word characters, lowercase, and remove stopwords for
        // each line. ReadString may return data together with io.EOF when the
        // last line has no trailing newline, so the line is processed before
        // the EOF check.
        for {
            line, readErr := reader.ReadString('\n')
            if readErr != nil && readErr != io.EOF {
                panic(readErr)
            }

            cleaned := nonWord.ReplaceAllString(line, " ")
            tokens := strings.Fields(strings.ToLower(cleaned))

            filtered := make([]string, 0, len(tokens))
            for _, word := range tokens {
                if !stopwords[word] {
                    filtered = append(filtered, word)
                }
            }

            if len(filtered) > 0 {
                csv := strings.Join(filtered, ",")
                if _, err := writer.WriteString(csv + "\n"); err != nil {
                    panic(err)
                }
            }

            if readErr == io.EOF {
                break
            }
        }

        // Flush before the deferred Close runs, otherwise buffered output
        // would be lost.
        if err = writer.Flush(); err != nil {
            panic(err)
        }
    }