Solution to Exercise: Web Crawler https://tour.golang.org/concurrency/10
package main

import (
	"fmt"
	"sync"
	"time"
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// SafeCounter guards the set of visited URLs with a mutex.
type SafeCounter struct {
	v   map[string]bool
	mux sync.Mutex
}

var c = SafeCounter{v: make(map[string]bool)}

// checkvisited marks url as visited and reports whether it had already
// been visited. The pointer receiver is essential: a value receiver
// would copy the mutex, so every call would lock its own private copy.
func (s *SafeCounter) checkvisited(url string) bool {
	s.mux.Lock()
	defer s.mux.Unlock()
	if _, ok := s.v[url]; !ok {
		s.v[url] = true
		return false
	}
	return true
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
// URLs are fetched in parallel, and no URL is fetched twice.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	if c.checkvisited(url) {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher)
	}
}

func main() {
	Crawl("http://golang.org/", 4, fetcher)
	// Crude synchronization: sleep long enough for the goroutines to finish.
	time.Sleep(5 * time.Second)
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"http://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"http://golang.org/pkg/",
			"http://golang.org/cmd/",
		},
	},
	"http://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"http://golang.org/",
			"http://golang.org/cmd/",
			"http://golang.org/pkg/fmt/",
			"http://golang.org/pkg/os/",
		},
	},
	"http://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
	"http://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
}
Why didn't you guys use sync.Map for the safe cache instead of implementing it all manually?
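For reference, a rough sketch of the sync.Map version (reusing the Fetcher interface and the fake fetcher from the snippets above; LoadOrStore acts as the atomic check-and-mark, so no explicit mutex is needed):

package main

import (
	"fmt"
	"sync"
)

// crawl works like Crawl above, but uses sync.Map as the visited set.
func crawl(url string, depth int, fetcher Fetcher, visited *sync.Map, wg *sync.WaitGroup) {
	defer wg.Done()
	if depth <= 0 {
		return
	}
	// LoadOrStore atomically claims the url; loaded means another
	// goroutine got here first.
	if _, loaded := visited.LoadOrStore(url, true); loaded {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		wg.Add(1)
		go crawl(u, depth-1, fetcher, visited, wg)
	}
}

func main() {
	var visited sync.Map
	var wg sync.WaitGroup
	wg.Add(1)
	go crawl("https://golang.org/", 4, fetcher, &visited, &wg)
	wg.Wait()
}

One likely reason for the manual version: the Tour's concurrency chapter predates sync.Map (added in Go 1.9), and the sync.Map docs themselves recommend a plain map with separate locking for most uses.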
package main

import (
	"fmt"
	"sync"
)

// Cache is a mutex-guarded map from url to "already fetched".
type Cache struct {
	mu   sync.Mutex
	urls map[string]bool
}

// InsertUrls records newly discovered urls as not yet fetched,
// without resetting urls that have already been fetched.
func (c *Cache) InsertUrls(urls []string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, url := range urls {
		if _, ok := c.urls[url]; !ok {
			c.urls[url] = false
		}
	}
}

// MarkFetched marks url as fetched and reports whether it had
// already been fetched, as one atomic check-and-set.
func (c *Cache) MarkFetched(url string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	already := c.urls[url]
	c.urls[url] = true
	return already
}

// IsUrlFetched reports whether url has already been fetched.
func (c *Cache) IsUrlFetched(url string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.urls[url]
}

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache *Cache, wg *sync.WaitGroup) {
	defer wg.Done()
	if depth <= 0 {
		return
	}
	// Atomically claim this url; bail out if another goroutine already has.
	if cache.MarkFetched(url) {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Record the newly discovered urls in the cache.
	cache.InsertUrls(urls)
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		// Skip urls already fetched; MarkFetched above still guards
		// against two goroutines passing this check at the same time.
		if !cache.IsUrlFetched(u) {
			wg.Add(1)
			go Crawl(u, depth-1, fetcher, cache, wg)
		}
	}
}

// SyncCrawl runs a synchronized crawl using sync.Mutex and sync.WaitGroup.
func SyncCrawl(url string, depth int, fetcher Fetcher) {
	var wg sync.WaitGroup
	cache := Cache{urls: make(map[string]bool)}
	wg.Add(1)
	go Crawl(url, depth, fetcher, &cache, &wg)
	wg.Wait()
}

func main() {
	SyncCrawl("https://golang.org/", 4, fetcher)
	fmt.Println("done")
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type safemap struct {
	m  sync.Mutex
	mm map[string]bool
}

var safe = safemap{mm: make(map[string]bool)}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth,
// fetching URLs in parallel and skipping already-visited ones.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	// Check and mark the url in one critical section, so no two
	// goroutines can both decide to fetch it. Reading the shared map
	// without the lock would be a data race.
	safe.m.Lock()
	visited := safe.mm[url]
	safe.mm[url] = true
	safe.m.Unlock()
	if visited {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher)
	}
}

func main() {
	Crawl("https://golang.org/", 4, fetcher)
	// Crude synchronization: give the spawned goroutines a moment to finish.
	time.Sleep(time.Second)
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Could you guys give me some feedback on this code?
Here is my answer, using channels.
package main

import (
	"fmt"
	"sync"
)

type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

// SafeCounter guards the set of visited URLs with a mutex.
type SafeCounter struct {
	mu sync.Mutex
	v  map[string]bool
}

// Visit marks key as visited and reports whether it had already been
// visited, in one atomic step, so two goroutines can never both claim
// the same url.
func (c *SafeCounter) Visit(key string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	visited := c.v[key]
	c.v[key] = true
	return visited
}

func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup, c *SafeCounter) error {
	if depth <= 0 || c.Visit(url) {
		return nil
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		return err
	}
	wg.Add(len(urls)) // Add the number of URLs to crawl before starting to crawl them
	ch <- url + " '" + body + "'"
	for _, u := range urls {
		go func(u string) {
			defer wg.Done()
			Crawl(u, depth-1, fetcher, ch, wg, c)
		}(u)
	}
	return nil
}

func main() {
	ch := make(chan string)
	c := SafeCounter{v: make(map[string]bool)}
	var wg sync.WaitGroup
	// Increment the WaitGroup before starting the crawl.
	wg.Add(1)
	go func() {
		defer wg.Done()
		if err := Crawl("https://golang.org/", 4, fetcher, ch, &wg, &c); err != nil {
			fmt.Println(err)
		}
	}()
	// Close ch once every crawl goroutine has finished, so the
	// range loop below terminates.
	go func() {
		wg.Wait()
		close(ch)
	}()
	for result := range ch {
		fmt.Printf("found: %s \n", result)
	}
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Here's my solution. Pretty neat imo!
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// SafeMap is a mutex-guarded set of urls that have been crawled.
type SafeMap struct {
	mu sync.Mutex
	m  map[string]string
}

// checkAndAdd reports whether url is already cached, adding it to the
// cache if not, as a single atomic operation.
func (sm *SafeMap) checkAndAdd(url string) bool {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	if _, ok := sm.m[url]; ok {
		return true
	}
	sm.m[url] = url
	return false
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, sm *SafeMap) {
	defer wg.Done()
	if depth <= 0 {
		return
	}
	// Check the cache and claim the url in one step, so two goroutines
	// can never both crawl it.
	if sm.checkAndAdd(url) {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		wg.Add(1)
		go Crawl(u, depth-1, fetcher, sm)
	}
}
var wg sync.WaitGroup
func main() {
sm := SafeMap{m: make(map[string]string)}
wg.Add(1)
Crawl("https://golang.org/", 4, fetcher, &sm)
wg.Wait()
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
The WaitGroup solution really looks out of scope for "A Tour of Go", and the time.Sleep solution looks far too disgusting ))
Here is my solution using chained channels. It's not ideal, since I've only been learning Go for about two weeks, but it works and (I hope :) covers these conditions:
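As an illustration of the channel-only idea (a hypothetical sketch, not this commenter's actual code, reusing the Fetcher and fake fetcher definitions from the snippets above): the visited map travels through a one-slot channel that serves as a lock, and each crawl reports completion on a done channel, so neither WaitGroup nor Sleep is needed:

package main

import "fmt"

// crawl fetches url, spawns one goroutine per linked url, waits for
// them on childDone, then signals its own completion on done.
func crawl(url string, depth int, fetcher Fetcher, visited chan map[string]bool, done chan struct{}) {
	defer func() { done <- struct{}{} }()
	if depth <= 0 {
		return
	}
	// Take ownership of the visited map; only one goroutine holds it at a time.
	v := <-visited
	seen := v[url]
	v[url] = true
	visited <- v
	if seen {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	childDone := make(chan struct{})
	for _, u := range urls {
		go crawl(u, depth-1, fetcher, visited, childDone)
	}
	for range urls {
		<-childDone // one receive per child crawl
	}
}

func main() {
	visited := make(chan map[string]bool, 1)
	visited <- make(map[string]bool)
	done := make(chan struct{})
	go crawl("https://golang.org/", 4, fetcher, visited, done)
	<-done
}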