Skip to content

Instantly share code, notes, and snippets.

@siffash
Last active February 17, 2018 00:43
Show Gist options
  • Save siffash/5c720f0d963e8a50c64da3c89abfd317 to your computer and use it in GitHub Desktop.
Save siffash/5c720f0d963e8a50c64da3c89abfd317 to your computer and use it in GitHub Desktop.
Extract all unique languages from OSM XML file + all unique prefixes & suffixes
<?php
/**
 * Scans an OSM XML dump for "name:<lang>" tags and for "name:prefix" /
 * "name:suffix" tags, then prints the collected languages (with frequency and
 * sample values) and the distinct prefixes/suffixes as an HTML report.
 *
 * This opening section configures the runtime, opens the dump and counts its
 * lines so that the processing loop can report progress as a percentage.
 */

// The run is long and memory-hungry: remove the limits and keep PHP
// diagnostics out of the generated HTML.
ini_set('display_errors', '0');
ini_set('memory_limit', '-1');
ini_set('max_execution_time', 0);
set_time_limit(0);

// Prints an HTML fragment and pushes it towards the client immediately.
// ob_flush() has to run before flush(): it hands PHP's own output buffer to
// the SAPI, and flush() then sends it on. In the opposite order the text that
// was just echoed can stay queued until the next flush. ob_flush() is skipped
// when no buffer is active because it would only raise a notice.
$report = function ($html) {
    echo $html;
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
};

// Input file.
$xml_file = 'planet-latest.osm';
$report('Opening the file ' . $xml_file . '... ');

$handle = fopen($xml_file, 'r');
if ($handle === false) {
    exit('cannot open the file, the script is terminated.');
}
$report('opened successfully<br/>Counting the lines... ');

// First pass: only count the lines; the total drives the progress percentage.
$total_lines = 0;
while (fgets($handle)) {
    $total_lines++;
}
$report('the file contains ' . $total_lines . ' lines<br/>Starting processing the file... see the progress below');

// Back to the start for the actual processing pass.
rewind($handle);

$lines = 0;      // lines processed so far
$langs = [];     // language code => number of occurrences
$examples = [];  // language code => list of sample values
$pref_suf = [];  // lower-cased prefix/suffix => 0 or list of language codes

// Progress is reported once per 1% of the file. Clamp the step to at least 1:
// for files shorter than 50 lines the rounded 1% is 0, and using that as a
// modulo divisor in the processing loop would fail with a division by zero.
$one_percent = max(1, (int) round($total_lines * 0.01));
// Processing pass: inspect every line of the dump, collecting
//   - $langs / $examples: how often each "name:<lang>" key occurs, plus up to
//     six sample values per language;
//   - $pref_suf: every distinct (lower-cased) "name:prefix" / "name:suffix"
//     value, mapped to the language codes it was qualified with, or to 0 when
//     it has only been seen without a language qualifier.
// Progress is echoed periodically while the loop runs.
while ($line = fgets($handle)) {
    // Language-specific name: key ends in name:<lang>, followed by v="<value>".
    if (preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches)) {
        $lang = $matches[1];
        $example = $matches[2];
        // "prefix" and "suffix" are not languages; they are handled below.
        if ($lang !== 'prefix' && $lang !== 'suffix') {
            if (isset($langs[$lang])) {
                $langs[$lang]++;
                // Keep at most six sample values per language.
                if (count($examples[$lang]) < 6) {
                    $examples[$lang][] = $example;
                }
            } else {
                $langs[$lang] = 1;
                $examples[$lang] = [$example];
            }
        }
    }

    // Prefix/suffix, optionally qualified with a language (name:prefix:<lang>).
    if (preg_match('/name:(?:prefix|suffix):?([^"]*)" v="([^"]+)/', $line, $matches)) {
        // Lower-case the value so that spelling variants collapse into one key.
        // The fallback is written out explicitly: in "$x = a() or b()" the
        // assignment binds tighter than "or", so b() would never be stored and
        // a failed transliteration would leave false as the array key.
        $pr_sf = function_exists('transliterator_transliterate')
            ? transliterator_transliterate('Any-Lower', $matches[2])
            : false;
        if ($pr_sf === false) {
            $pr_sf = function_exists('mb_strtolower')
                ? mb_strtolower($matches[2], 'UTF-8')
                : strtolower($matches[2]);
        }

        $pr_sf_lang = $matches[1];
        if (!array_key_exists($pr_sf, $pref_suf)) {
            // 0 is the marker for "seen, but without any language qualifier".
            $pref_suf[$pr_sf] = $pr_sf_lang === '' ? 0 : [$pr_sf_lang];
        } elseif ($pr_sf_lang !== '') {
            if ($pref_suf[$pr_sf] === 0) {
                $pref_suf[$pr_sf] = [$pr_sf_lang];
            } elseif (!in_array($pr_sf_lang, $pref_suf[$pr_sf], true)) {
                $pref_suf[$pr_sf][] = $pr_sf_lang;
            }
        }
    }

    $lines++;

    // Defensive: treat a step below 1 as 1 so the modulo can never divide by zero.
    $progress_step = max(1, (int) $one_percent);
    // Report at every step, right after the start (line 2) and at the very end.
    if ($lines % $progress_step === 0 || $lines == 2 || $lines == $total_lines) {
        echo '<br/>Current progress: ' . round($lines * 100 / $total_lines) . '%, ' . $lines . ' lines have been processed';
        // ob_flush() first (PHP's buffer -> SAPI), then flush() (SAPI -> client);
        // ob_flush() without an active buffer would only raise a notice.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }
}
// Final report: elapsed time and totals, a table of languages ordered by
// frequency, and the alphabetical list of prefixes/suffixes.
fclose($handle);

// Escapes a value for HTML text and attribute contexts. The values were cut
// out of raw XML lines, so they may already contain entities such as &amp;
// double_encode is off to leave those intact while still neutralising any raw
// markup characters. ENT_SUBSTITUTE keeps the value when it contains invalid
// UTF-8 instead of turning it into an empty string.
$esc = function ($text) {
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
};

// Round the difference once; rounding start and end separately can be off by one second.
$elapsed = round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']);
echo '<p/>Script execution time: ' . $elapsed . ' secs. <b>'
    . count($langs) . ' languages</b> and <b>' . count($pref_suf) . ' prefixes & suffixes</b> have been found.';

// Most frequent languages first.
arsort($langs);
echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
    . '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
foreach ($langs as $lang => $freq) {
    $samples = implode(', ', array_map($esc, $examples[$lang]));
    echo '<tr><td>' . $esc($lang) . '</td><td>' . $freq . '</td><td>' . $samples . '</td></tr>';
}
echo '</table>';

// Prefixes/suffixes in key order; the tooltip lists the languages of each one.
ksort($pref_suf);
$pref_suf_output = [];
echo '<p/><b>Prefixes & suffixes:</b> ';
foreach ($pref_suf as $pr_sf => $pr_sf_lang) {
    // An affix never seen with a language qualifier is stored as 0, not as an
    // array; implode() rejects a non-array, so it gets an empty tooltip.
    $title = is_array($pr_sf_lang) ? implode(', ', $pr_sf_lang) : '';
    $pref_suf_output[] = '<span title="' . $esc($title) . '">\'' . $esc($pr_sf) . '\'</span>';
}
echo implode(', ', $pref_suf_output);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment