Created
September 15, 2012 11:58
-
-
Save nkuln/3727492 to your computer and use it in GitHub Desktop.
Download all images from WordPress posts, and then replace all the existing URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import HTMLParser
import random
import urllib2
import urlparse
from os import path

import MySQLdb as mdb
from bs4 import BeautifulSoup
class ImageMigrator:
    """Download the images referenced by a post and remap their URLs.

    Image links are collected from ``<img src>`` and ``<a href>`` attributes,
    fetched relative to ``oldbaseurl``, saved under ``imgdir`` and mapped to a
    URL below ``newbaseurl``.
    """

    # Extensions (without the leading dot) treated as images by default.
    DEFAULT_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'gif', 'bmp', 'png')

    def __init__(self,
                 imgdir='migrate_images',
                 imgexts=None,
                 oldbaseurl=u'http://www.oldblog.net',
                 newbaseurl=u'http://www.example.com/migrate_images'):
        """Store the migration settings.

        Args:
            imgdir: Directory that downloaded images (and ``output.html``)
                are written to. It is not created here.
            imgexts: Iterable of image extensions; ``None`` selects
                ``DEFAULT_IMAGE_EXTENSIONS``. A leading dot is optional.
            oldbaseurl: Base URL used to resolve links found in the HTML.
            newbaseurl: URL prefix under which saved images will be served.
        """
        self.imgdir = imgdir
        # Copy into a fresh list so instances never share (or mutate) a
        # default-argument object.
        if imgexts is None:
            imgexts = self.DEFAULT_IMAGE_EXTENSIONS
        self.imgexts = list(imgexts)
        self.oldbaseurl = oldbaseurl
        self.newbaseurl = newbaseurl

    def download_images_and_create_mappings(self, html):
        """Download every image linked from *html* and return the URL map.

        Returns:
            dict mapping each absolute source URL (the link joined with
            ``oldbaseurl``) to its new URL under ``newbaseurl``. Links that
            fail to download are reported on stdout and left out of the map.
        """
        urlmap = {}
        for link in self._extract_image_links(html):
            try:
                link = urlparse.urljoin(self.oldbaseurl, link)
                filename = self._filename_from_url(link)
                print("downloading from %s .." % link)
                # Read the whole body before touching the disk so a failed
                # transfer does not leave an empty file behind, and make sure
                # the response is closed even when read() raises.
                with contextlib.closing(urllib2.urlopen(link)) as response:
                    data = response.read()
                filename = self._unused_filename(filename)
                with open(path.join(self.imgdir, filename), 'wb') as out:
                    out.write(data)
                newurl = self._new_url_for(filename)
                print("image downloaded %s" % newurl)
                urlmap[link] = newurl
            except urllib2.HTTPError:
                print("**** cannot download image from  %s  due to HTTP error! ****" % link)
            except urllib2.URLError:
                print("**** cannot download image from  %s  due to URL error! ****" % link)
            except Exception:
                # Best effort: one bad image must not abort the whole post,
                # but Ctrl-C / SystemExit are allowed to propagate.
                print("**** cannot download image from  %s  due to unknown error! ****" % link)
        return urlmap

    def _unused_filename(self, filename):
        """Prefix *filename* with random digits until it is free in imgdir."""
        while path.isfile(path.join(self.imgdir, filename)):
            filename = str(random.randint(0, 99)) + filename
        return filename

    def _new_url_for(self, filename):
        """Return the URL of *filename* directly below ``newbaseurl``.

        Plain concatenation is used instead of ``urljoin`` because ``urljoin``
        replaces the last path segment of a base that has no trailing slash.
        """
        return self.newbaseurl.rstrip('/') + '/' + filename

    def _filename_from_url(self, url):
        """Return the text after the last ``/``, ``?``, ``#`` or ``=`` in *url*."""
        for separator in '?#=':
            url = url.replace(separator, '/')
        return url.split('/')[-1]

    def _extract_image_links(self, html):
        """Return the distinct image links found in *html* (order unspecified).

        Candidates are the ``src`` of ``<img>`` tags and the ``href`` of
        ``<a>`` tags; only those accepted by ``_is_image_link`` are kept.
        An empty list is returned when the HTML parser rejects the input.
        """
        try:
            soup = BeautifulSoup(html)
            candidates = [tag['src'] for tag in soup.find_all('img')
                          if tag.has_attr('src')]
            candidates += [tag['href'] for tag in soup.find_all('a')
                           if tag.has_attr('href')]
            return list(set(link for link in candidates
                            if self._is_image_link(link)))
        except HTMLParser.HTMLParseError:
            print("**** cannot parse the HTML from post ****")
            return []

    def _is_image_link(self, link):
        """Return True when *link* ends with ``.<ext>`` for a known extension.

        The comparison is case-insensitive and requires the dot, so a URL
        that merely ends in the letters ``jpg`` is not treated as an image.
        """
        suffixes = tuple('.' + ext.lower().lstrip('.') for ext in self.imgexts)
        return link.lower().endswith(suffixes)

    def apply_mapping(self, urlmap, html):
        """Replace every key of *urlmap* found in *html* with its value.

        The rewritten HTML is also appended, UTF-8 encoded, to
        ``output.html`` inside ``imgdir``.

        Returns:
            The rewritten HTML.
        """
        ret = html
        # Longest keys first: a short key that is a substring of a longer one
        # must not rewrite part of the longer URL before it gets its turn.
        for old in sorted(urlmap, key=len, reverse=True):
            ret = ret.replace(old, urlmap[old])
        # Binary append because the payload is already encoded bytes.
        with open(path.join(self.imgdir, 'output.html'), 'ab') as log:
            log.write(ret.encode('UTF-8'))
        return ret

    def relative_to_absolute_url(self, html):
        """Map relative image links in *html* to URLs under ``newbaseurl``.

        Only links starting with ``/`` or ``.`` are considered; nothing is
        downloaded.

        Returns:
            dict mapping each such link, exactly as it appears in the HTML,
            to ``newbaseurl`` plus the link's file name.
        """
        urlmap = {}
        for link in self._extract_image_links(html):
            if link.startswith('/') or link.startswith('.'):
                newurl = self._new_url_for(self._filename_from_url(link))
                print('map %s to %s' % (link, newurl))
                urlmap[link] = newurl
        return urlmap
# Script entry point: migrate the images of every row in wp_posts and store
# the rewritten post content back in the same row.
if __name__ == '__main__':
    db = mdb.connect(host='localhost', db='lifeonvm', user='root', charset='utf8')
    try:
        migrator = ImageMigrator(oldbaseurl='http://www.solidskill.net/',
                                 newbaseurl='http://static.nkuln.com/uploaded/')
        cursor = db.cursor()
        # Select exactly the two columns the UPDATE below refers to instead of
        # relying on their position in `SELECT *`.
        # NOTE(review): assumes the former row[0] / row[4] were ID and
        # post_content (the stock WordPress column order) -- confirm.
        cursor.execute('SELECT ID, post_content FROM wp_posts')
        for post_id, post_content in cursor.fetchall():
            print("==== PROCESSING POST ID = %s ====" % post_id)
            urlmap = migrator.download_images_and_create_mappings(post_content)
            # Quick patch to change relative to absolute URLs ..
            #urlmap = migrator.relative_to_absolute_url(post_content)
            data = migrator.apply_mapping(urlmap, post_content)
            ret = cursor.execute('UPDATE wp_posts SET post_content=%s WHERE ID=%s', (data, post_id))
            print("row effected by update = %s id= %s" % (ret, post_id))
        # Commit once, after every post has been processed.
        print("commiting changes ..")
        db.commit()
        print("commiting done!")
    finally:
        # Release the connection even when a post fails part-way through.
        db.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You have a WordPress blog?