Skip to content

Instantly share code, notes, and snippets.

@ak1t0
Last active May 17, 2020 12:14
Show Gist options
  • Save ak1t0/36072f7662be129752924fbd22be39b9 to your computer and use it in GitHub Desktop.
Save ak1t0/36072f7662be129752924fbd22be39b9 to your computer and use it in GitHub Desktop.
Kindleのキャンペーンから雑に各作品1巻ずつ抜き出すスクリプト
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/agnivade/levenshtein"
"log"
"net/http"
"os"
"strconv"
"time"
)
// main reads the campaign ID from the first command-line argument, builds the
// Amazon.co.jp campaign listing URL from it, and hands that URL to crawl.
// It exits via log.Fatal when no campaign ID is given.
func main() {
	// TODO: use the flag package instead of reading os.Args directly.
	args := os.Args[1:]
	if len(args) == 0 {
		log.Fatal("campaign id is not found")
	}

	// Example campaign ID: 8134491051
	const endpoint = "https://www.amazon.co.jp/b"
	crawl(endpoint + "?node=" + args[0])
}
// crawl walks every result page of the campaign listing at url and prints the
// collected titles, one per line, to standard output.
//
// url is the listing URL without a page parameter; "&page=N" is appended for
// each page. The page count is taken from the last ".pagnDisabled" element of
// the first response. A title whose Levenshtein distance to an already
// collected title is below 8 is treated as a duplicate (presumably another
// volume of the same work) and skipped.
//
// Any request, HTTP status, or parse failure terminates the process through
// log.Fatal.
func crawl(url string) {
	// The default client has no timeout; bound each request so a stalled
	// connection cannot hang the crawl forever.
	client := &http.Client{Timeout: 30 * time.Second}

	first, err := fetchDocument(client, url)
	if err != nil {
		log.Fatal(err)
	}

	// Take the max page number. When the listing shows no disabled pagination
	// marker, fall back to a single page instead of crawling nothing.
	// NOTE(review): assumes the marker is absent on short listings — confirm
	// against the live markup.
	lastPage := 1
	first.Find(".pagnDisabled").Each(func(_ int, s *goquery.Selection) {
		n, err := strconv.Atoi(s.Text())
		if err != nil {
			log.Fatal(err)
		}
		lastPage = n
	})

	// TODO: use go channel and make concurrent
	// Start from an empty slice: pre-filling it with empty strings would make
	// every title shorter than 8 characters look like a duplicate of "".
	var titles []string
	for page := 1; page <= lastPage; page++ {
		doc, err := fetchDocument(client, url+"&page="+strconv.Itoa(page))
		if err != nil {
			log.Fatal(err)
		}

		// parse title
		doc.Find(".a-link-normal.s-access-detail-page.s-color-twister-title-link.a-text-normal").Each(func(_ int, s *goquery.Selection) {
			title := s.Text()
			if title == "" || isNearDuplicate(title, titles) {
				return
			}
			titles = append(titles, title)
		})

		// Pause between pages to avoid hammering the server.
		time.Sleep(100 * time.Millisecond)
	}

	// display result
	for _, title := range titles {
		fmt.Println(title)
	}
}

// fetchDocument GETs url with client and parses the response body as HTML.
// The body is always closed before returning, and a non-200 status is
// reported as an error rather than parsed as if it were a listing page.
func fetchDocument(client *http.Client, url string) (*goquery.Document, error) {
	res, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: unexpected status %s", url, res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s: %w", url, err)
	}
	return doc, nil
}

// isNearDuplicate reports whether title is within a Levenshtein distance of
// less than 8 from any entry in seen.
// TODO: make duplicate product check more correctly
func isNearDuplicate(title string, seen []string) bool {
	for _, prev := range seen {
		if levenshtein.ComputeDistance(title, prev) < 8 {
			return true
		}
	}
	return false
}
// SYNOPSIS
// go run main.go $campaign_id
// $campaign_id is the node parameter in the campaign URL
// the minimal Kindle campaign URL is https://www.amazon.co.jp/b?node=$campaign_id&page=$page
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment