kimadactyl · December 17, 2021 21:24
diff --git a/blogspot-to-hugo-converter.rb b/blogspot-to-hugo-converter.rb
 # To grab the site, run something like the following from inside ./scrape
 # (so that the mirror lands in ./scrape/pass_01, which INPUT_DIR points into):
 # wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains 1.bp.blogspot.com,2.bp.blogspot.com,3.bp.blogspot.com,4.bp.blogspot.com,myblog.blogspot.com http://myblog.blogspot.com/ -P pass_01

 require 'date'
 require 'fileutils'
 require 'kramdown'
 require 'logger'
 require 'nokogiri'
 require 'rake'
 require 'sanitize'

 # Root of the wget mirror of the blog itself. Image files are looked up one
 # level above this directory (see the "#{INPUT_DIR}../" paths further down).
 INPUT_DIR = './scrape/pass_01/myblog.blogspot.com/'
 # Destination root; converted posts are written beneath it.
 OUTPUT_DIR = './output/pass_01'

 # Image references that cannot be matched or copied are recorded here.
 missing_files = Logger.new('missing_files.log')

 # Delete and remake the output directory.
 # rm_rf tolerates a directory that does not exist yet (first run), and
 # mkdir_p also creates missing parents such as ./output.
 FileUtils.rm_rf(OUTPUT_DIR)
 FileUtils.mkdir_p(OUTPUT_DIR)

 # Get all HTML files in any directory
 # Note this will skip txt files e.g. robots.txt
 # Because INPUT_DIR already ends in '/', the pattern contains '//' right
 # after the input root.
 files = Dir.glob("#{INPUT_DIR}/**/*.html")

 # Returns the `content` attribute of the first <meta property="..."> tag in
 # +html+, or +fallback+ when either the tag or the attribute is missing.
 def meta_content(html, property, fallback = 'FIXME')
   tag = html.at_css("meta[property='#{property}']")
   (tag && tag['content']) || fallback
 end

 # Copies the blogspot CDN file referenced by node[attribute] into +bundle_dir+
 # and rewrites the attribute to the bare file name, so the reference resolves
 # inside the page bundle.
 #
 # node       - Nokogiri element carrying the reference (<img> or <a>)
 # attribute  - name of the attribute holding the URL ('src' or 'href')
 # bundle_dir - directory of the page bundle being built
 # log        - Logger that receives unresolved references
 #
 # Returns true when the reference pointed at a CDN mirror path, false when it
 # did not (in which case the node is left untouched and the miss is logged).
 def localize_cdn_reference(node, attribute, bundle_dir, log)
   src = node[attribute]
   # One anchored-enough pattern replaces the former split + match pair: the
   # dots are literal and a path after the host is required, so a nil match
   # can no longer slip through to an index operation.
   cdn_path = src && src[%r{\d\.bp\.blogspot\.com/.+}]
   unless cdn_path
     log.error("Missing file: #{src}")
     return false
   end

   source = "#{INPUT_DIR}../#{cdn_path}"
   target = File.join(bundle_dir, File.basename(source))
   begin
     FileUtils.cp(source, target)
   rescue StandardError
     # Best effort: the reference is still rewritten below so the file can be
     # dropped into the bundle by hand later.
     log.error("Couldn't resolve file: #{src}")
   end
   node[attribute] = File.basename(target)
   true
 end

 # Configure markdown processor (identical for every page, so built once)
 sanitize_config = {
   elements: %w[b em i strong u a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s samp small strike sub sup time ul var img table iframe],
   attributes: {
     'a' => ['href'],
     'abbr' => ['title'],
     'blockquote' => ['cite'],
     'dfn' => ['title'],
     'q' => ['cite'],
     'time' => ['datetime', 'pubdate'],
     'img' => ['alt', 'src'],
     'iframe' => ['alt', 'src']
   },
   protocols: {
     'a' => { 'href' => ['ftp', 'http', 'https', 'mailto', :relative] },
     'blockquote' => { 'cite' => ['http', 'https', :relative] },
     'q' => { 'cite' => ['http', 'https', :relative] },
     'img' => { 'src' => ['http', 'https', :relative] },
     'iframe' => { 'src' => ['http', 'https', :relative] }
   }
 }

 # Convert every mirrored HTML page into a Hugo page bundle:
 # <OUTPUT_DIR>/<year or no_year>/<basename>/index.md plus its images.
 # Progress is printed one character at a time: T = <img> localized,
 # I = image link localized, M = reference not on the CDN mirror,
 # S = page skipped (no post body), P = page written.
 files.each do |file|
   # Block form closes the file handle as soon as parsing is done
   html = File.open(file) { |io| Nokogiri::HTML(io) }
   new_blog_path = File.basename(file, '.html')

   # Grab the main post body; pages without one cannot be converted
   content = html.at_css('div.post-body')
   unless content
     missing_files.warn("No div.post-body in #{file}; page skipped")
     print 'S'
     next
   end

   # Page metadata from the Open Graph tags
   title = meta_content(html, 'og:title')
   description = meta_content(html, 'og:description') # only used by the debug output below
   image = meta_content(html, 'og:image')

   # Get the date, which we use to set the directory
   published = html.at_css("abbr[itemprop='datePublished']")
   date = published && published['title']
   year = begin
     Date.parse(date).year
   rescue ArgumentError, TypeError
     # ArgumentError: unparseable string; TypeError: no date attribute at all
     nil
   end
   date = 'FIXME' unless year
   dir = "#{OUTPUT_DIR}/#{year || 'no_year'}/#{new_blog_path}"
   FileUtils.mkdir_p(dir)

   # Move images around. The XPath is relative ('.//') so only nodes inside
   # the post body are visited, and one unresolvable reference no longer stops
   # the remaining ones from being processed.
   content.xpath('.//img').each do |img|
     print(localize_cdn_reference(img, 'src', dir, missing_files) ? 'T' : 'M')
   end

   # Same treatment for the links that wrap images
   content.xpath('.//a[@imageanchor="1"]').each do |a|
     print(localize_cdn_reference(a, 'href', dir, missing_files) ? 'I' : 'M')
   end

   # puts "Converting: #{title} | #{date}"
   # puts "#{description}"
   # puts "#{image}"

   # Convert the content to markdown
   sanitized = Sanitize.fragment(content, sanitize_config)
   markdown = Kramdown::Document.new(sanitized, html_to_native: true).to_kramdown

   # Inside a double-quoted YAML scalar both '"' and '\' must be escaped
   escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }
   # Path of the page relative to the input root, used as the old URL
   alias_path = file.sub(INPUT_DIR, '').sub(%r{\A/+}, '')

   # Create output file
   output = <<~HEREDOC
     ---
     title: "#{escaped_title}"
     image: "#{File.basename(image)}"
     date: #{date}
     aliases: "/#{alias_path}"
     ---

     #{markdown}
   HEREDOC

   # Write into the bundle directory created above; this also covers the
   # no_year case, where there is no year to build the path from.
   File.write(File.join(dir, 'index.md'), output)
   print 'P'
 end
	# To grab the site, run something like the following from inside ./scrape
	# (so that the mirror lands in ./scrape/pass_01, which INPUT_DIR points into):
	# wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains 1.bp.blogspot.com,2.bp.blogspot.com,3.bp.blogspot.com,4.bp.blogspot.com,myblog.blogspot.com http://myblog.blogspot.com/ -P pass_01

	require 'date'
	require 'fileutils'
	require 'kramdown'
	require 'logger'
	require 'nokogiri'
	require 'rake'
	require 'sanitize'

	# Root of the wget mirror of the blog itself. Image files are looked up one
	# level above this directory (see the "#{INPUT_DIR}../" paths further down).
	INPUT_DIR = './scrape/pass_01/myblog.blogspot.com/'
	# Destination root; converted posts are written beneath it.
	OUTPUT_DIR = './output/pass_01'

	# Image references that cannot be matched or copied are recorded here.
	missing_files = Logger.new('missing_files.log')

	# Delete and remake the output directory.
	# rm_rf tolerates a directory that does not exist yet (first run), and
	# mkdir_p also creates missing parents such as ./output.
	FileUtils.rm_rf(OUTPUT_DIR)
	FileUtils.mkdir_p(OUTPUT_DIR)

	# Get all HTML files in any directory
	# Note this will skip txt files e.g. robots.txt
	# The recursive '**/*.html' pattern is required here: '*/.html' would only
	# match files literally named ".html" one level down, i.e. nothing.
	files = Dir.glob("#{INPUT_DIR}/**/*.html")

	# Returns the `content` attribute of the first <meta property="..."> tag in
	# +html+, or +fallback+ when either the tag or the attribute is missing.
	def meta_content(html, property, fallback = 'FIXME')
	  tag = html.at_css("meta[property='#{property}']")
	  (tag && tag['content']) || fallback
	end

	# Copies the blogspot CDN file referenced by node[attribute] into +bundle_dir+
	# and rewrites the attribute to the bare file name, so the reference resolves
	# inside the page bundle.
	#
	# node       - Nokogiri element carrying the reference (<img> or <a>)
	# attribute  - name of the attribute holding the URL ('src' or 'href')
	# bundle_dir - directory of the page bundle being built
	# log        - Logger that receives unresolved references
	#
	# Returns true when the reference pointed at a CDN mirror path, false when it
	# did not (in which case the node is left untouched and the miss is logged).
	def localize_cdn_reference(node, attribute, bundle_dir, log)
	  src = node[attribute]
	  # One pattern replaces the former split + match pair: the dots are literal
	  # and a path after the host is required, so a nil match can no longer slip
	  # through to an index operation.
	  cdn_path = src && src[%r{\d\.bp\.blogspot\.com/.+}]
	  unless cdn_path
	    log.error("Missing file: #{src}")
	    return false
	  end

	  source = "#{INPUT_DIR}../#{cdn_path}"
	  target = File.join(bundle_dir, File.basename(source))
	  begin
	    FileUtils.cp(source, target)
	  rescue StandardError
	    # Best effort: the reference is still rewritten below so the file can be
	    # dropped into the bundle by hand later.
	    log.error("Couldn't resolve file: #{src}")
	  end
	  node[attribute] = File.basename(target)
	  true
	end

	# Configure markdown processor (identical for every page, so built once)
	sanitize_config = {
	  elements: %w[b em i strong u a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s samp small strike sub sup time ul var img table iframe],
	  attributes: {
	    'a' => ['href'],
	    'abbr' => ['title'],
	    'blockquote' => ['cite'],
	    'dfn' => ['title'],
	    'q' => ['cite'],
	    'time' => ['datetime', 'pubdate'],
	    'img' => ['alt', 'src'],
	    'iframe' => ['alt', 'src']
	  },
	  protocols: {
	    'a' => { 'href' => ['ftp', 'http', 'https', 'mailto', :relative] },
	    'blockquote' => { 'cite' => ['http', 'https', :relative] },
	    'q' => { 'cite' => ['http', 'https', :relative] },
	    'img' => { 'src' => ['http', 'https', :relative] },
	    'iframe' => { 'src' => ['http', 'https', :relative] }
	  }
	}

	# Convert every mirrored HTML page into a Hugo page bundle:
	# <OUTPUT_DIR>/<year or no_year>/<basename>/index.md plus its images.
	# Progress is printed one character at a time: T = <img> localized,
	# I = image link localized, M = reference not on the CDN mirror,
	# S = page skipped (no post body), P = page written.
	files.each do |file|
	  # Block form closes the file handle as soon as parsing is done
	  html = File.open(file) { |io| Nokogiri::HTML(io) }
	  new_blog_path = File.basename(file, '.html')

	  # Grab the main post body; pages without one cannot be converted
	  content = html.at_css('div.post-body')
	  unless content
	    missing_files.warn("No div.post-body in #{file}; page skipped")
	    print 'S'
	    next
	  end

	  # Page metadata from the Open Graph tags
	  title = meta_content(html, 'og:title')
	  description = meta_content(html, 'og:description') # only used by the debug output below
	  image = meta_content(html, 'og:image')

	  # Get the date, which we use to set the directory
	  published = html.at_css("abbr[itemprop='datePublished']")
	  date = published && published['title']
	  year = begin
	    Date.parse(date).year
	  rescue ArgumentError, TypeError
	    # ArgumentError: unparseable string; TypeError: no date attribute at all
	    nil
	  end
	  date = 'FIXME' unless year
	  dir = "#{OUTPUT_DIR}/#{year || 'no_year'}/#{new_blog_path}"
	  FileUtils.mkdir_p(dir)

	  # Move images around. The XPath is relative ('.//') so only nodes inside
	  # the post body are visited, and one unresolvable reference no longer stops
	  # the remaining ones from being processed.
	  content.xpath('.//img').each do |img|
	    print(localize_cdn_reference(img, 'src', dir, missing_files) ? 'T' : 'M')
	  end

	  # Same treatment for the links that wrap images
	  content.xpath('.//a[@imageanchor="1"]').each do |a|
	    print(localize_cdn_reference(a, 'href', dir, missing_files) ? 'I' : 'M')
	  end

	  # puts "Converting: #{title} | #{date}"
	  # puts "#{description}"
	  # puts "#{image}"

	  # Convert the content to markdown
	  sanitized = Sanitize.fragment(content, sanitize_config)
	  markdown = Kramdown::Document.new(sanitized, html_to_native: true).to_kramdown

	  # Inside a double-quoted YAML scalar both '"' and '\' must be escaped
	  escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }
	  # Path of the page relative to the input root, used as the old URL
	  alias_path = file.sub(INPUT_DIR, '').sub(%r{\A/+}, '')

	  # Create output file
	  output = <<~HEREDOC
	    ---
	    title: "#{escaped_title}"
	    image: "#{File.basename(image)}"
	    date: #{date}
	    aliases: "/#{alias_path}"
	    ---

	    #{markdown}
	  HEREDOC

	  # Write into the bundle directory created above; this also covers the
	  # no_year case, where there is no year to build the path from.
	  File.write(File.join(dir, 'index.md'), output)
	  print 'P'
	end