Last active
February 17, 2018 00:43
-
-
Save siffash/5c720f0d963e8a50c64da3c89abfd317 to your computer and use it in GitHub Desktop.
Extract all unique languages from OSM XML file + all unique prefixes & suffixes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Scans an OSM XML file line by line and reports, as an HTML page:
 *   - every distinct "name:<lang>" key suffix, how often it occurs, and up to
 *     six example values for it;
 *   - every distinct (lower-cased) "name:prefix" / "name:suffix" value together
 *     with the language codes it was tagged with.
 *
 * The file is read twice: once to count lines (so progress can be shown as a
 * percentage) and once to extract the data.
 */

ini_set('display_errors', '0');
ini_set('memory_limit', '-1');
ini_set('max_execution_time', '0');
set_time_limit(0);

/**
 * Sends everything echoed so far to the client.
 *
 * PHP's userspace output buffer has to be flushed before the system-level
 * flush(); ob_flush() is skipped when no buffer is active because it would
 * only raise a notice.
 */
function push_output()
{
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}

/**
 * Lower-cases a string, preferring ICU's transliterator and falling back to
 * mb_strtolower()/strtolower() when ext-intl is missing or the call fails.
 *
 * @param string $text
 * @return string
 */
function lowercase_text($text)
{
    if (function_exists('transliterator_transliterate')) {
        $lowered = transliterator_transliterate('Any-Lower', $text);
        if ($lowered !== false) {
            return $lowered;
        }
    }

    return function_exists('mb_strtolower') ? mb_strtolower($text, 'UTF-8') : strtolower($text);
}

/**
 * Escapes text for HTML output.
 *
 * Values are taken verbatim from the XML source, so they may already contain
 * entities such as "&amp;"; double-encoding is disabled to leave those intact.
 *
 * @param string $text
 * @return string
 */
function html_text($text)
{
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
}

// FILE
$xml_file = 'planet-latest.osm';

echo 'Opening the file ' . $xml_file . '... ';
push_output();

$handle = fopen($xml_file, 'r');
if ($handle === false) {
    exit('cannot open the file, the script is terminated.');
}

echo 'opened successfully<br/>Counting the lines... ';
push_output();

// First pass: count lines. Compare against false explicitly so that a line
// whose content is falsy (e.g. a final "0" without newline) is still counted.
$total_lines = 0;
while (fgets($handle) !== false) {
    $total_lines++;
}

echo 'the file contains ' . $total_lines . ' lines<br/>Starting processing the file... see the progress below';
push_output();

rewind($handle);

$lines = 0;
$langs = [];     // language code => number of occurrences
$examples = [];  // language code => list of up to six example values
$pref_suf = [];  // lower-cased prefix/suffix => list of language codes (may be empty)

// Report progress roughly every 1% of the file; never less than every line,
// otherwise small files would cause a modulo by zero below.
$one_percent = max(1, (int) round($total_lines * 0.01));

// Second pass: extract languages and prefixes/suffixes.
while (($line = fgets($handle)) !== false) {
    // name:<lang>" v="<value>  (keys with a further ":" part do not match here)
    if (preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches) === 1) {
        $lang = $matches[1];
        $example = $matches[2];

        if ($lang !== 'prefix' && $lang !== 'suffix') {
            if (!isset($langs[$lang])) {
                $langs[$lang] = 0;
                $examples[$lang] = [];
            }

            $langs[$lang]++;
            if (count($examples[$lang]) < 6) {
                $examples[$lang][] = $example;
            }
        }
    }

    // name:prefix" / name:suffix" with an optional ":<lang>" part
    if (preg_match('/name:(?:prefix|suffix):?([^"]*)" v="([^"]+)/', $line, $matches) === 1) {
        $pr_sf = lowercase_text($matches[2]);
        $pr_sf_lang = $matches[1];

        if (!isset($pref_suf[$pr_sf])) {
            $pref_suf[$pr_sf] = [];
        }
        if ($pr_sf_lang !== '' && !in_array($pr_sf_lang, $pref_suf[$pr_sf], true)) {
            $pref_suf[$pr_sf][] = $pr_sf_lang;
        }
    }

    $lines++;
    if ($lines % $one_percent === 0 || $lines === 2 || $lines === $total_lines) {
        echo '<br/>Current progress: ' . round($lines * 100 / $total_lines) . '%, ' . $lines . ' lines have been processed';
        push_output();
    }
}

fclose($handle);

echo '<p/>Script execution time: ' . (round(microtime(true)) - round($_SERVER['REQUEST_TIME_FLOAT'])) . ' secs. <b>'
    . count($langs) . ' languages</b> and <b>' . count($pref_suf) . ' prefixes & suffixes</b> have been found.';

// Most frequent languages first.
arsort($langs);

echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
    . '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
foreach ($langs as $lang => $freq) {
    // Numeric-looking codes become integer array keys, hence the string cast in html_text().
    echo '<tr><td>' . html_text($lang) . '</td><td>' . $freq . '</td><td>'
        . implode(', ', array_map('html_text', $examples[$lang])) . '</td></tr>';
}
echo '</table>';

ksort($pref_suf);

$pref_suf_output = [];
echo '<p/><b>Prefixes & suffixes:</b> ';
foreach ($pref_suf as $pr_sf => $pr_sf_lang) {
    // The tooltip lists the language codes seen for this prefix/suffix (empty if none).
    $pref_suf_output[] = '<span title="' . implode(', ', array_map('html_text', $pr_sf_lang)) . '">\''
        . html_text($pr_sf) . '\'</span>';
}
echo implode(', ', $pref_suf_output);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment