Last active
March 18, 2018 15:46
-
-
Save antsmartian/a74f5e5c9ccc97ffbe0775b9e5f5ce22 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)
// Worker fetches a single URL and reports the hyperlinks it finds.
type Worker struct {
	url      string          // page to fetch
	response chan []*url.URL // discovered links are sent back to the crawler on this channel
}
func (w *Worker) processLinks(doc *goquery.Document) (result []*url.URL) { | |
//Should improve it to filter version. | |
urls := doc.Find("a[href]").Map(func(i int, selection *goquery.Selection) string { | |
val, _ := selection.Attr("href") | |
if strings.Contains(val,"http") { | |
return val | |
} | |
return "" | |
}) | |
for _, s := range urls { | |
if len(s) > 0 && !strings.HasPrefix(s, "#") { | |
if parsed, e := url.Parse(s); e == nil { | |
parsed = doc.Url.ResolveReference(parsed) | |
result = append(result, parsed) | |
} | |
} | |
} | |
return | |
} | |
func (w *Worker) visitUrl() (e error) { | |
var doc *goquery.Document | |
req, e := http.NewRequest("GET",w.url,nil) | |
if e != nil { | |
return e | |
} | |
var HttpClient = &http.Client{} | |
res , _ := HttpClient.Do(req) | |
if res != nil && res.StatusCode == 200 { | |
if bd, e := ioutil.ReadAll(res.Body); e != nil { | |
fmt.Println("Error while reading the body") | |
} else { | |
if node , e := html.Parse(bytes.NewBuffer(bd)); e != nil { | |
fmt.Println("Error parsing the body") | |
} else { | |
doc = goquery.NewDocumentFromNode(node) | |
doc.Url = res.Request.URL | |
} | |
res.Body = ioutil.NopCloser(bytes.NewBuffer(bd)) | |
urls := w.processLinks(doc) | |
w.response <- urls //send response to crawler the list of urls | |
} | |
} else { | |
fmt.Println("Error, status code is not 200") | |
} | |
return | |
} | |
// Crawler coordinates the crawl: it seeds the first Worker with url and
// collects each worker's discovered links over the shared response channel.
type Crawler struct {
	url      string          // starting page of the crawl
	response chan []*url.URL // fan-in channel shared by all spawned workers
}
//Spwan worker. | |
func (c *Crawler) Run() { | |
c.response = make(chan []*url.URL) | |
worker := Worker{url:c.url,response:c.response} | |
go worker.visitUrl() | |
for { | |
select { | |
//received the urls from worker | |
//iterate and create new worker | |
//All worker run in go routine. | |
case res := <-c.response: | |
for _, element := range res { | |
fmt.Println("Visiting",element) | |
if strings.HasPrefix(element.String(),"http:") { | |
worker := Worker{url:element.String(),response:c.response} | |
go worker.visitUrl() | |
} | |
} | |
break | |
} | |
} | |
} | |
func main() { | |
crawler := & Crawler{ | |
url : "https://en.wikipedia.org/wiki/Groovy", | |
} | |
crawler.Run() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment