Last active
May 17, 2020 12:14
-
-
Save ak1t0/36072f7662be129752924fbd22be39b9 to your computer and use it in GitHub Desktop.
Kindleのキャンペーンから雑に各作品1巻ずつ抜き出すスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/agnivade/levenshtein" | |
"log" | |
"net/http" | |
"os" | |
"strconv" | |
"time" | |
) | |
func main() { | |
baseUrl := "https://www.amazon.co.jp/b" | |
// campaign := "8134491051" | |
// TODO: use flag | |
if len(os.Args[1:]) < 1 { | |
log.Fatal("campaign id is not found") | |
} | |
campaign := os.Args[1:][0] | |
url := baseUrl + "?node=" + campaign | |
crawl(url) | |
} | |
func crawl(url string) { | |
res, err := http.Get(url) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer res.Body.Close() | |
// use goquery | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// take max page number | |
var max int | |
doc.Find(".pagnDisabled").Each(func(i int, s *goquery.Selection) { | |
number := s.Text() | |
pgn, err := strconv.ParseInt(number, 0, 64) | |
if err != nil { | |
log.Fatal(err) | |
} | |
max = int(pgn) | |
}) | |
// crawl | |
// TODO: use go channel and make concurrent | |
l := make([]string, 99999) | |
for i := 1; i <= max; i++ { | |
target := url + "&page=" + strconv.Itoa(i) | |
res, err := http.Get(target) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// use goquery | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// parse title | |
doc.Find(".a-link-normal.s-access-detail-page.s-color-twister-title-link.a-text-normal").Each(func(i int, s *goquery.Selection) { | |
title := s.Text() | |
var f bool = true | |
for _, v := range l { | |
// TODO: make duplicate product check more correctly | |
d := levenshtein.ComputeDistance(title, v) | |
if d < 8 { | |
f = false | |
} | |
} | |
if f { | |
l = append(l, title) | |
} | |
}) | |
// stop heavy access | |
time.Sleep(100 * time.Millisecond) | |
} | |
// display result | |
for _, v := range l { | |
if v != "" { | |
fmt.Println(v) | |
} | |
} | |
} | |
// SYNOPSIS
// go run main.go $campaign_id
// $campaign_id is the node parameter in the URL
// The minimal Kindle URL is https://www.amazon.co.jp/b?node=$campaign_id&page=$page
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment