Skip to content

Instantly share code, notes, and snippets.

@lightory
Created June 28, 2016 14:05

Revisions

  1. lightory created this gist Jun 28, 2016.
    96 changes: 96 additions & 0 deletions crawl_yupoo.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,96 @@
    require 'digest/sha1'
    require 'open-uri'

    # The gem's file name is lowercase; 'Nokogiri' only loads on case-insensitive filesystems.
    require 'nokogiri'


    # Crawls every album listed on a Yupoo user's album index page.
    #
    # Fetches http://www.yupoo.com/photos/<username>/albums/, takes the first
    # link found inside each album tile and passes its href to crawl_album.
    #
    # @param username [String] Yupoo account whose albums are crawled
    # @return [void]
    def main(username = "lightory")
      base_url = "http://www.yupoo.com/photos/#{username}/albums/"

      index_page = safe_open(base_url)
      if index_page.nil?
        # safe_open returns nil once its retries are exhausted; nothing to parse.
        warn "Could not fetch album list: #{base_url}"
        return
      end

      doc = Nokogiri::HTML(index_page)
      doc.css("div#albums_list div.set-case").each do |album_element|
        link = album_element.css("a").first
        # A tile without a usable link cannot be crawled; skip it rather than
        # crash on nil.
        next if link.nil? || link["href"].nil?

        # NOTE(review): the href is forwarded unchanged. If the site emits
        # relative links it would need the host prepended - confirm.
        crawl_album(link["href"])
      end
    end


    # Alternative entry point: crawls a single hard-coded album instead of
    # walking the whole album index.
    def main2
      crawl_album("http://www.yupoo.com/photos/lightory/albums/1373449/")
    end


    # Downloads every photo of one album, page by page, into a directory named
    # after the album title (created in the current working directory).
    #
    # Pages are requested as <base_album_url>page<N>/ starting at N = 1 until a
    # page without photo links is reached. At that point an empty "finished"
    # marker file is written into the album directory; when the marker already
    # exists the album is skipped.
    #
    # @param base_album_url [String] album URL; "page<N>/" is appended directly
    # @return [void]
    def crawl_album(base_album_url)
      page = 1

      loop do
        album_url = "#{base_album_url}page#{page}/"
        album_page = safe_open(album_url)
        if album_page.nil?
          # safe_open returns nil once its retries are exhausted.
          warn "Could not fetch album page: #{album_url}"
          break
        end

        album_doc = Nokogiri::HTML(album_page)
        title_element = album_doc.css("span#albumtitle").first
        if title_element.nil?
          warn "No album title found on #{album_url}"
          break
        end

        # NOTE(review): the title comes straight from the page and is used as a
        # directory name; titles containing "/" or ".." are not sanitized.
        album_name = title_element.content
        # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
        Dir.mkdir(album_name) unless File.exist?(album_name)
        puts "Start Crawling #{album_name}..." if page == 1
        puts "Start Crawling #{album_name} Page #{page}..."

        finished_marker = "#{album_name}/finished"
        if File.exist?(finished_marker)
          puts "Already Crawled #{album_name}."
          puts ""
          break
        end

        photo_elements = album_doc.css("div.album-photos a.img")
        if photo_elements.empty?
          # A page without photos marks the end of the album.
          puts "Finished Crawling #{album_name}."
          puts ""
          File.write(finished_marker, "")
          break
        end

        photo_elements.each do |photo_element|
          photo_page_url = photo_element["href"]
          next if photo_page_url.nil?

          puts "Crawl Photo: #{photo_page_url}"
          # The file name is the SHA1 of the photo page URL, so it is stable
          # across runs and free of path-unsafe characters.
          file_name = "#{album_name}/#{Digest::SHA1.hexdigest(photo_page_url)}.jpg"
          crawl_photo(photo_page_url, file_name)
        end

        puts "Finished Crawling #{album_name} Page #{page}."
        page += 1
      end
    end


    # Downloads the image shown on one photo page and stores it at file_name.
    #
    # The page is fetched from http://www.yupoo.com + page_url, the src of its
    # img#photo_img element is read, and that image is written to file_name in
    # binary mode. If the page, the image element or the image itself cannot be
    # obtained, a warning is printed and no file is created.
    #
    # @param page_url [String] photo page path, appended to the site host
    # @param file_name [String] destination path of the image file
    # @return [void]
    def crawl_photo(page_url, file_name)
      page_url = "http://www.yupoo.com" + page_url

      page = safe_open(page_url)
      if page.nil?
        warn "Could not fetch photo page: #{page_url}"
        return
      end

      doc = begin
        Nokogiri::HTML(page)
      ensure
        page.close
      end

      photo_element = doc.css("img#photo_img").first
      photo_url = photo_element && photo_element["src"]
      if photo_url.nil?
        warn "No photo found on #{page_url}"
        return
      end

      # Download before opening the target so a failed download does not leave
      # an empty file behind.
      image = safe_open(photo_url)
      if image.nil?
        warn "Could not download photo: #{photo_url}"
        return
      end

      begin
        File.open(file_name, "wb") { |file| IO.copy_stream(image, file) }
      ensure
        image.close
      end
    end


    # Opens url through open-uri, retrying when the attempt raises.
    #
    # Sleeps delay seconds before every attempt (presumably to throttle
    # requests). After a failed attempt it prints "Retry <url>" and tries again,
    # up to max_retries additional times; once those are used up it reports the
    # last error on stderr and returns nil.
    #
    # @param url [String] URL (or path) to open
    # @param max_retries [Integer] retries allowed after the first attempt
    # @param delay [Numeric] seconds to sleep before each attempt
    # @return [IO, nil] the stream returned by URI.open, or nil if every
    #   attempt failed
    def safe_open(url, max_retries: 3, delay: 1)
      retries = 0
      begin
        sleep(delay)
        # URI.open instead of Kernel#open: since Ruby 3.0 open-uri no longer
        # makes Kernel#open handle URLs, so open(url) would always fail.
        URI.open(url)
      rescue StandardError => e
        if retries >= max_retries
          warn "Giving up on #{url}: #{e.message}"
          return nil
        end

        puts "Retry #{url}"
        retries += 1
        retry
      end
    end


    # Script entry point: the crawl starts unconditionally, as soon as this
    # file is executed or loaded.
    main