Last active
December 17, 2021 21:24
-
-
Save kimadactyl/a284f6aef0025aed3f0debe94cd612d9 to your computer and use it in GitHub Desktop.
Migrate Blogspot to Hugo page bundles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To grab the site, run something like:
# wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains 1.bp.blogspot.com,2.bp.blogspot.com,3.bp.blogspot.com,4.bp.blogspot.com,myblog.blogspot.com http://myblog.blogspot.com/ -P pass_01
require 'date' | |
require 'fileutils' | |
require 'kramdown' | |
require 'logger' | |
require 'nokogiri' | |
require 'rake' | |
require 'sanitize' | |
INPUT_DIR = './scrape/pass_01/myblog.blogspot.com/' | |
OUTPUT_DIR = './output/pass_01' | |
missing_files = Logger.new('missing_files.log') | |
# Delete and remake the output directory | |
FileUtils.remove_dir(OUTPUT_DIR) | |
Dir.mkdir(OUTPUT_DIR) | |
# Get all HTML files in any directory | |
# Note this will skip txt files e.g. robots.txt | |
files = Dir.glob("#{INPUT_DIR}/**/*.html") | |
files.each do |file| | |
html = Nokogiri::HTML(File.open(file)) | |
basename = File.basename(file, ".html") | |
# Grab the main post body | |
content = html.at_css('div.post-body') | |
# Get the page title | |
begin | |
title = html.at_css("meta[property='og:title']")['content'] | |
rescue | |
title = "FIXME" | |
end | |
new_blog_path = basename | |
# Get the meta description from og tag | |
begin | |
description = html.at_css("meta[property='og:description']")['content'] | |
rescue | |
description = "FIXME" | |
end | |
# Get the image meta property from OG tag | |
begin | |
image = html.at_css("meta[property='og:image']")['content'] | |
rescue | |
image = "FIXME" | |
end | |
# Get the date, which we use to set the directory | |
begin | |
date = html.at_css("abbr[itemprop='datePublished']")['title'] | |
year = Date.parse(date).year | |
dir = "#{OUTPUT_DIR}/#{year}/#{new_blog_path}" | |
rescue | |
date = "FIXME" | |
dir = "#{OUTPUT_DIR}/no_year/#{new_blog_path}" | |
end | |
system 'mkdir', '-p', dir | |
# Move images around | |
content.xpath('//img').each do |img| | |
src = img['src'] | |
# Is this coming from a blogspot CDN? | |
if src.split('bp.blogspot.com/')[1] | |
img_location = "#{INPUT_DIR}../" + src.match(/\d.bp.blogspot.com.*/)[0] | |
else | |
print "M" | |
missing_files.error("Missing file: #{src}") | |
break | |
end | |
# Move it to the new location in a page bundle | |
new_location = "#{dir}/#{File.basename(img_location)}" | |
begin | |
FileUtils.cp(img_location, new_location) | |
rescue | |
missing_files.error("Couldn't resolve file: #{src}") | |
end | |
img.attributes['src'].value = File.basename(new_location) | |
print "T" | |
end | |
content.xpath('//a[@imageanchor="1"]').each do |a| | |
src = a['href'] | |
if src.split('bp.blogspot.com/')[1] | |
img_location = "#{INPUT_DIR}../" + src.match(/\d.bp.blogspot.com.*/)[0] | |
else | |
print "M" | |
missing_files.error("Missing file: #{src}") | |
break | |
end | |
new_location = "#{dir}/#{File.basename(img_location)}" | |
begin | |
FileUtils.cp(img_location, new_location) | |
rescue | |
missing_files.error("Couldn't resolve file: #{src}") | |
end | |
a.attributes['href'].value = File.basename(new_location) | |
print "I" | |
end | |
# puts "Converting: #{title} | #{date}" | |
# puts "#{description}" | |
# puts "#{image}" | |
# Configure markdown processor | |
sanitize_config = { | |
elements: %w[b em i strong u a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s samp small strike sub sup time ul var img table iframe], | |
:attributes=>{ | |
"a"=>["href"], | |
"abbr"=>["title"], | |
"blockquote"=>["cite"], | |
"dfn"=>["title"], | |
"q"=>["cite"], | |
"time"=>["datetime", "pubdate"], | |
"img"=>["alt", "src"], | |
"iframe"=>["alt", "src"] | |
}, | |
:protocols=>{ | |
"a"=>{"href"=>["ftp", "http", "https", "mailto", :relative]}, | |
"blockquote"=>{"cite"=>["http", "https", :relative]}, | |
"q"=>{"cite"=>["http", "https", :relative]}, | |
"img"=>{"src"=>["http", "https", :relative]}, | |
"iframe"=>{"src"=>["http", "https", :relative]} | |
} | |
} | |
# Convert the content to markdown | |
content = Sanitize.fragment(content, sanitize_config) | |
content = Kramdown::Document.new(content, html_to_native: true) | |
content = content.to_kramdown | |
# Create output file | |
output = <<~HEREDOC | |
--- | |
title: "#{title.gsub('"', '\"')}" | |
image: "#{File.basename(image)}" | |
date: #{date} | |
aliases: "/#{file.split('//')[1]}" | |
--- | |
#{content} | |
HEREDOC | |
# Write output to output dir and change extension | |
File.write("#{OUTPUT_DIR}/#{year}/#{new_blog_path}/index.md", output) | |
print "P" | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment