Created
July 20, 2023 13:20
-
-
Save adamfranco/15f0cdf716b43a82bc74eda3d3b40b7a to your computer and use it in GitHub Desktop.
Fix image paths in an archive of sites.middlebury.edu/middmag-old/ produced by HTTrack
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php | |
<?php | |
// Path of the HTML file to rewrite, given as the first command-line argument.
if (!isset($argv[1])) {
    throw new Exception("Usage: fix-paths.php path/to/sites.middlebury.edu/middmag-old/page.html");
}
$file = $argv[1];
if (!preg_match('#\.html$#', $file)) {
    throw new Exception("$file must be a .html file.");
}

// Locate the file relative to the site root. The leading group may be empty
// so that a relative path starting at "sites.middlebury.edu/" is accepted too.
if (!preg_match('#(.*)sites\.middlebury\.edu/middmag-old/(.+)#', $file, $pathMatches)) {
    throw new Exception("File must be within sites.middlebury.edu/middmag-old/, $file given.");
}

// $sitePrefix is the chain of "../" segments that leads from the file's
// directory back up to the site root ('' when the file is in the root itself).
$directory = dirname($pathMatches[2]);
if ($directory === '.') {
    $path = [];
} else {
    $path = explode("/", $directory);
}
$depth = count($path);
$sitePrefix = str_repeat('../', $depth);

// Diagnostic dump of the computed location; every variable is always defined.
var_dump($directory, $path, $depth, $sitePrefix);

// Keep the original contents so the file is only rewritten when something changed.
$html = $origHtml = file_get_contents($file);
if ($html === false) {
    throw new Exception("Unable to read $file.");
}

// Map of regular expression => name of the callback that rewrites its matches.
$patterns = [];
###################################################### | |
# Replacement patterns and functions. | |
###################################################### | |
# Images under sites.middlebury.edu/middmag/
$patterns['# (src|href)="((?:\.\./)+)\.\./middmag/(files/(?:[^/"]+/)*([^"/]+))"#'] = 'fix_middmag_files';

/**
 * Rewrite one src/href attribute that climbs out of the site into the
 * sibling "middmag" directory so that it points at the same files/ path
 * one level lower, i.e. without the "../middmag/" hop.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML contents.
 * @param string $sitePrefix Relative prefix to the site root (not used here).
 * @param array  $m          One match set: [0] whole attribute, [1] attribute
 *                           name, [2] leading "../" run, [3] files/ path.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middmag_files($file, $html, $sitePrefix, $m) {
    $matchedAttribute = $m[0];
    $rewrittenAttribute = sprintf(' %s="%s%s"', $m[1], $m[2], $m[3]);

    // Report the substitution on standard output.
    fwrite(STDOUT, sprintf("\n%s\n%s\n%s\n", __FUNCTION__, $matchedAttribute, $rewrittenAttribute));

    return str_replace($matchedAttribute, $rewrittenAttribute, $html);
}
# Images under middleburymagazine.com/
$patterns['# (src|href)="((?:\.\./)+)\.\./\.\./middleburymagazine\.com/(files/(?:[^/"]+/)*([^"/]+))"#'] = 'fix_middleburymagazine_com_files';

/**
 * Rewrite one src/href attribute that points into the sibling
 * "middleburymagazine.com" directory so that it points at the same files/
 * path without the "../../middleburymagazine.com/" hop.
 *
 * When the referenced file name ends in ".html", the target directory on disk
 * is scanned for an entry with the same base name and that entry's real name
 * (e.g. a .jpg or .png) is used instead. If the directory cannot be resolved
 * or read, the original file name is kept.
 *
 * @param string $file       Path of the HTML file being processed; relative
 *                           references are resolved against its directory.
 * @param string $html       Current HTML contents.
 * @param string $sitePrefix Relative prefix to the site root (not used here).
 * @param array  $m          One match set: [0] whole attribute, [1] attribute
 *                           name, [2] leading "../" run, [3] files/ path.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middleburymagazine_com_files($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    $relativePrefix = $m[2];
    $path = dirname($m[3]);
    $fileName = basename($m[3]);

    // Look for a png, jpg or real path.
    if (preg_match('/\.html$/', $fileName)) {
        // realpath() returns false for a missing directory; scanning that
        // would raise an error, so only scan directories that really exist.
        $realPath = realpath(dirname($file).'/'.$relativePrefix.$path);
        $entries = ($realPath !== false && is_dir($realPath)) ? scandir($realPath) : false;

        if ($entries !== false) {
            $fileNameBase = pathinfo($fileName, PATHINFO_FILENAME);
            foreach ($entries as $f) {
                if ($f === '.' || $f === '..') {
                    continue;
                }
                // Strip the extension and compare. Strict comparison matters:
                // with == numeric-looking names such as "0123" and "123"
                // would be treated as equal.
                if (pathinfo($f, PATHINFO_FILENAME) === $fileNameBase) {
                    $fileName = $f;
                    break;
                }
            }
        }
    }

    $newAttr = ' '.$m[1].'="'.$relativePrefix.$path.'/'.$fileName.'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Links to sites.middlebury.edu/middmag/
$patterns['# (href|src)="https?://(?:sites\.middlebury\.edu/middmag|middmag\.com)/([^"]+)"#'] = 'fix_middmag_links';

/**
 * Rewrite one absolute link to sites.middlebury.edu/middmag or middmag.com
 * into a link relative to the site root of the file being processed.
 *
 * The URL path is mapped to a local page path as follows:
 *  - no path at all            -> index.html
 *  - path ending in "/"        -> path + index.html
 *  - path ending in name.ext   -> unchanged (including root-level files)
 *  - anything else             -> path + /index.html
 * Any query string and fragment are appended again afterwards.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML contents.
 * @param string $sitePrefix Relative prefix from the file to the site root.
 * @param array  $m          One match set: [0] whole attribute, [1] attribute
 *                           name, [2] everything after the site's base URL.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middmag_links($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];

    // parse_url() returns false for seriously malformed input; treat that
    // like a URL without any components.
    $u = parse_url($m[2]);
    if (!is_array($u)) {
        $u = [];
    }

    if (empty($u['path'])) {
        $pagePath = 'index.html';
    } else {
        $pagePath = $u['path'];
    }

    // Ending with a trailing slash as expected for directories.
    if (preg_match('#/$#', $pagePath)) {
        $pagePath = $pagePath.'index.html';
    }
    // Ending with a filename, either below a directory or directly in the
    // site root (e.g. "feed.xml" or the default "index.html").
    elseif (preg_match('#(?:^|/)[^/]+\.\w+$#', $pagePath)) {
        // Nothing to do.
    }
    // Odd special case seen.
    elseif ($pagePath == '2014/02/26/uncle-donnie-t…s-on-the-world') {
        $pagePath = '2014/02/26/uncle-donnie-takes-on-the-world/index.html';
    }
    // Ending with no file extension or trailing slash.
    else {
        $pagePath = $pagePath.'/index.html';
    }

    # Add query string.
    if (!empty($u['query'])) {
        $pagePath .= '?'.$u['query'];
    }
    # Add fragments
    if (!empty($u['fragment'])) {
        $pagePath .= '#'.$u['fragment'];
    }

    $newAttr = ' '.$m[1].'="'.$sitePrefix.$pagePath.'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Point the branding onclick handler at the local index page.
$patterns['# onclick="location.href=\'http://sites.middlebury.edu/middmag-old\';"#'] = 'remove_branding_onclick';

/**
 * Replace the onclick handler that navigates to the live middmag-old site
 * with one that navigates to the local index.html of the archive.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML contents.
 * @param string $sitePrefix Relative prefix from the file to the site root.
 * @param array  $m          One match set: [0] the whole matched attribute,
 *                           including its leading space.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function remove_branding_onclick($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    // The match includes the space that separates it from the preceding
    // attribute, so the replacement has to start with one as well.
    $newAttr = ' onclick="location.href=\''.$sitePrefix.'index.html\';"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Fix srcset urls.
$patterns['# srcset="([^"]+)"#'] = 'fix_srcset_urls';

/**
 * Make absolute middmag-old URLs inside one srcset attribute relative to the
 * site root of the file being processed.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML contents.
 * @param string $sitePrefix Relative prefix from the file to the site root.
 * @param array  $m          One match set: [0] the whole attribute including
 *                           its leading space, [1] the attribute value.
 * @return string The HTML with every occurrence of the matched attribute
 *                replaced, or unchanged when the value contains no site URL.
 */
function fix_srcset_urls($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    $contents = str_replace(
        [
            'https://sites.middlebury.edu/middmag-old/',
            'http://sites.middlebury.edu/middmag-old/',
        ],
        $sitePrefix,
        $m[1]
    );

    // Leave attributes without any site URL exactly as they are.
    if ($contents === $m[1]) {
        return $html;
    }

    // The match includes the space that separates it from the preceding
    // attribute, so the replacement has to start with one as well.
    $newAttr = ' srcset="'. $contents .'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
######################################################
# Loop through our patterns and make changes.
######################################################
foreach ($patterns as $regex => $callback) {
    // All matches are collected against the HTML as it stands when this
    // pattern is reached; each callback then returns the updated HTML.
    $found = preg_match_all($regex, $html, $matches, PREG_SET_ORDER);
    if ($found === false) {
        // A PCRE failure (e.g. backtrack limit) would otherwise go unnoticed.
        fwrite(STDERR, "Pattern $regex failed on $file (PCRE error ".preg_last_error().").\n");
        continue;
    }
    foreach ($matches as $match) {
        $html = call_user_func($callback, $file, $html, $sitePrefix, $match);
    }
}

// Only touch the file when a replacement actually changed something.
if ($html !== $origHtml) {
    if (file_put_contents($file, $html) === false) {
        throw new Exception("Unable to write $file.");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment