Last active
December 21, 2015 23:38
-
-
Save abjdiat/6383492 to your computer and use it in GitHub Desktop.
PS3 Friendly Arabic Subs Converter
Perl script to convert UTF-8 encoded Arabic subtitles to a PS3-friendly format.
For screenshots and examples, visit http://abjdiaty.blogspot.com/2012/01/ps3-friendly-arabic-subs-converter.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#changelog http://abjdiaty.blogspot.com/2012/01/ps3-friendly-arabic-subs-converter.html
#email [email protected]
use utf8;
# Warnings stay off: the rest of the script relies on undeclared package
# globals, which would otherwise flood STDERR.
no warnings;

# Usage: perl <this script> <subtitle file>
#
# Reads the UTF-8 subtitle file named by the first argument into @lines and
# opens FILE -- the handle every later print in this script writes to -- on
# "<input name without its extension>_ps3.srt".
defined $ARGV[0] or die "usage: $0 <utf8-subtitle-file>\n";

# The input is read directly. Going through an intermediate copy that is
# re-read before its write handle is closed loses whatever is still buffered.
open IN, "<:encoding(UTF-8)", $ARGV[0]
    or die "cannot read '$ARGV[0]': $!\n";
my @lines = <IN>;
close IN;

# Strip only a trailing ".ext" of the file name itself; the dot must not be
# the first character of the name and the extension may not contain a path
# separator, so "dir.v2/movie" keeps its directory intact.
$new_name = $ARGV[0];
$new_name =~ s{(?<=[^/\\])\.[^./\\]+$}{};
$new_name .= "_ps3.srt";

# Explicit UTF-8 layer so characters below U+0100 are encoded too instead of
# being written as raw Latin-1 bytes.
open FILE, ">:encoding(UTF-8)", $new_name
    or die "cannot write '$new_name': $!\n";
# Subtract $_[1] from $_[0] IN PLACE.
#
# The first argument is modified through the @_ alias, so callers pass the
# array element they want changed, e.g. shiftrate($letters_rev[$i+1], +1)
# lowers that element by one. Returns the new value.
sub shiftrate {
    $_[0] -= $_[1];
    return $_[0];
}
# @arabic_glyphs (defined further down) is an array of arrays indexed by the Unicode code point of the
# base letter: an Arabic letter is drawn differently depending on its position inside the word.
# Each row holds six values; slots 2, 3 and 4 are the code points used for the first, a middle and the last letter of a word.
####################################################
# Tell whether a word contains at least one Arabic character.
#
# Argument: the word to inspect (every character is checked, not only the
#           first one).
# Returns:  the string "true" as soon as a character whose code point is in
#           1563..1791, or is 65267 or 65266, is found; an empty list / undef
#           otherwise.
# Side effect: (re)fills the package array @array1 with the accepted code
#           points, because clean_the_ar_word reads that array.
sub isItArabic {
    my ($word) = @_;

    @array1 = (1563 .. 1791, 65267, 65266);
    my %is_arabic_code = map { $_ => 1 } @array1;

    for my $char (split //, $word) {
        return "true" if $is_arabic_code{ ord $char };
    }
    return;
}
################################################
# Turn a word into the list of its Arabic code points.
#
# Argument: the word.
# Returns:  the code points (numbers, in word order) of every character whose
#           code point is in 1563..1791 or is 65267 or 65266; every other
#           character is dropped. In scalar context the count is returned.
#
# Note: harakat (1611..1618) lie inside the accepted range, so this sub does
# not remove them.
# 1567 is the Arabic question mark.
sub clean_the_ar_word {
    my ($word) = @_;

    # A filter keeps every qualifying character. Deleting from the array while
    # indexing through it would skip the element after each deletion.
    return grep {
        ($_ >= 1563 && $_ <= 1791) || $_ == 65267 || $_ == 65266
    } map { ord } split //, $word;
}
###############################################
# @arabic_glyphs: positional glyph table, indexed by the code point of the
# base letter. Row layout:
#   [0] base code point (the lookup code checks that the row lists it)
#   [1] isolated form
#   [2] form used for the first letter of a word
#   [3] form used for a letter in the middle of a word
#   [4] form used for the last letter of a word
#   [5] a small number (looks like a count of distinct forms; nothing in this
#       script reads it -- TODO confirm before relying on it)
# Letters that never join to the following letter repeat their isolated form
# in slot 2 and their final form in slot 3.
#
# Arabic-Indic digits, currently disabled:
#$arabic_glyphs[1632] = [1632,1632,1632,1632,1632,3];
#$arabic_glyphs[1633] = [1633,1633,1633,1633,1633,3];
#$arabic_glyphs[1634] = [1634,1634,1634,1634,1634,3];
#$arabic_glyphs[1635] = [1635,1635,1635,1635,1635,3];
#$arabic_glyphs[1636] = [1636,1636,1636,1636,1636,3];
#$arabic_glyphs[1637] = [1637,1637,1637,1637,1637,3];
#$arabic_glyphs[1638] = [1638,1638,1638,1638,1638,3];
#$arabic_glyphs[1639] = [1639,1639,1639,1639,1639,3];
#$arabic_glyphs[1640] = [1640,1640,1640,1640,1640,3];
#$arabic_glyphs[1641] = [1641,1641,1641,1641,1641,3];
$arabic_glyphs[1569] = [1569,65152,65163,65164,65152,3];    # hamza (drawn on a yeh seat when joined)
$arabic_glyphs[1570] = [1570,65153,65153,65154,65154,2];    # alef with madda above
$arabic_glyphs[1571] = [1571,65155,65155,65156,65156,2];    # alef with hamza above
$arabic_glyphs[1572] = [1572,65157,65157,65158,65158,2];    # waw with hamza above
$arabic_glyphs[1573] = [1573,65159,65159,65160,65160,2];    # alef with hamza below
$arabic_glyphs[1575] = [1575,65165,65165,65166,65166,2];    # alef
$arabic_glyphs[1576] = [1576,65167,65169,65170,65168,4];    # beh
$arabic_glyphs[1577] = [1577,65171,65171,65172,65172,2];    # teh marbuta
$arabic_glyphs[1578] = [1578,65173,65175,65176,65174,4];    # teh
$arabic_glyphs[1579] = [1579,65177,65179,65180,65178,4];    # theh
$arabic_glyphs[1580] = [1580,65181,65183,65184,65182,4];    # jeem
$arabic_glyphs[1581] = [1581,65185,65187,65188,65186,4];    # hah
$arabic_glyphs[1582] = [1582,65189,65191,65192,65190,4];    # khah
$arabic_glyphs[1583] = [1583,65193,65193,65194,65194,2];    # dal
$arabic_glyphs[1584] = [1584,65195,65195,65196,65196,2];    # thal
$arabic_glyphs[1585] = [1585,65197,65197,65198,65198,2];    # reh
$arabic_glyphs[1586] = [1586,65199,65199,65200,65200,2];    # zain
$arabic_glyphs[1587] = [1587,65201,65203,65204,65202,4];    # seen
$arabic_glyphs[45]   = [45,45,45,45,45,4];                  # ASCII hyphen-minus
$arabic_glyphs[1588] = [1588,65205,65207,65208,65206,4];    # sheen
$arabic_glyphs[1589] = [1589,65209,65211,65212,65210,4];    # sad
$arabic_glyphs[1590] = [1590,65213,65215,65216,65214,4];    # dad
# tah: medial is U+FEC4 (65220) and final is U+FEC2 (65218); the two were swapped.
$arabic_glyphs[1591] = [1591,65217,65219,65220,65218,4];    # tah
# zah: medial is U+FEC8 (65224); the final form 65222 was listed in both slots.
$arabic_glyphs[1592] = [1592,65221,65223,65224,65222,4];    # zah
$arabic_glyphs[1593] = [1593,65225,65227,65228,65226,4];    # ain
$arabic_glyphs[1594] = [1594,65229,65231,65232,65230,4];    # ghain
$arabic_glyphs[1601] = [1601,65233,65235,65236,65234,4];    # feh
$arabic_glyphs[1602] = [1602,65237,65239,65240,65238,4];    # qaf
$arabic_glyphs[1603] = [1603,65241,65243,65244,65242,4];    # kaf
$arabic_glyphs[1604] = [1604,65245,65247,65248,65246,4];    # lam
$arabic_glyphs[1605] = [1605,65249,65251,65252,65250,4];    # meem
$arabic_glyphs[1606] = [1606,65253,65255,65256,65254,4];    # noon
$arabic_glyphs[1607] = [1607,65257,65259,65260,65258,4];    # heh
$arabic_glyphs[1608] = [1608,65261,65261,65262,65262,2];    # waw
$arabic_glyphs[1609] = [1609,65263,65263,65264,65264,2];    # alef maksura
$arabic_glyphs[1610] = [1610,65265,65267,65268,65266,4];    # yeh
$arabic_glyphs[1574] = [1574,65161,65163,65164,65162,2];    # yeh with hamza above
# Arabic question mark
$arabic_glyphs[1567] = [1567,1567,1567,1567,1567,1];
# Arabic letter jeh
$arabic_glyphs[1688] = [1688,64394,64394,64395,64395,2];
# Arabic letter tcheh
$arabic_glyphs[1670] = [1670,64378,64380,64381,64379,4];
# Arabic letter peh
$arabic_glyphs[1662] = [1662,64342,64344,64345,64343,4];
# Arabic letter keheh (U+06A9)
$arabic_glyphs[1705] = [1705,64398,64400,64401,64399,4];
# Arabic letter gaf
$arabic_glyphs[1711] = [1711,64402,64404,64405,64403,4];
# Farsi yeh
$arabic_glyphs[1740] = [1740,64508,64510,64511,64509,4];
# Heh with yeh above (U+06C0): only an isolated (U+FBA4) and a final (U+FBA5)
# presentation form exist. The row must start with 1728 itself, otherwise the
# membership check done by the lookup code never succeeds and the letter is
# dropped.
$arabic_glyphs[1728] = [1728,64420,64420,64421,64421,2];
# Convert one word and print it to the FILE handle.
#
# Argument: one whitespace-free token of a subtitle line.
# Output:   written to the global FILE handle; the return value is not
#           meaningful.
#
# * A token without any Arabic character (per isItArabic) is printed after
#   mapping ASCII digits, '%' and '?' to their Arabic counterparts.
# * For any other token, the Arabic code points returned by clean_the_ar_word
#   are replaced by the positional form stored in @arabic_glyphs (slot 2 for
#   the first letter, 3 for a middle letter, 4 for the last letter; a
#   one-letter word uses slot 2). Code points without a table row are dropped.
#   A series of correction passes then adjusts neighbouring forms, and the
#   result is printed in REVERSE order, one character at a time.
sub convertword2univalues {
    my ($word) = @_;

    if (!isItArabic($word)) {
        # 0-9 -> U+0660..U+0669, % -> U+066A, ? -> U+061F
        (my $plain = $word) =~ tr/0-9%?/\x{0660}-\x{0669}\x{066A}\x{061F}/;
        print FILE "$plain";
        return;
    }

    # --- positional forms -------------------------------------------------
    my @codes = clean_the_ar_word($word);
    my @rev;
    for my $i (0 .. $#codes) {
        my $code = $codes[$i];
        my $row  = $arabic_glyphs[$code];

        # A letter is only converted when its row exists and lists the code
        # point itself.
        next unless $row && grep { $_ == $code } @{$row};

        my $slot = $i == 0       ? 2
                 : $i == $#codes ? 4
                 :                 3;
        push @rev, $row->[$slot];
    }

    # Forms of letters that do not join to the letter after them (the
    # original list name was "isolated letters").
    my %breaks_joining = map { $_ => 1 } (
        65199,65153,65166,65200,65261,64394,64395,65193,65194,65195,65196,
        65197,65198,65157,65158,65262,65155,65154,65165,65159,65160,65166,
        65156,
    );
    # Medial/final forms, i.e. forms drawn joined to the previous letter.
    # Lowering such a code point by one gives the matching unjoined form
    # (medial -> initial, final -> isolated).
    my %joined_to_previous = map { $_ => 1 } (
        65170,65176,65180,65184,65188,65192,65204,65262,65208,65212,65216,
        65220,65224,65228,65232,65236,65240,
        65244,65248,65252,65256,65260,65268,65198,65166,65194,65172,65254,
        65250,65246,65242,65238,
        65234,65230,65226,65222,65218,65214,65210,65174,65206,65202,65200,
        65198,65196,65194,65190,
        65186,65182,65178,65168,65160,65162,65158,65156,65154,65258,65266,
        65264,65164,64343,64345,64401,64399,64405,64403,64395,64395,64381,
        64379,64511,64509,64421,
    );

    # --- pass 1: a letter after a non-joining letter must not look joined --
    # Runs left to right on purpose: a form changed here is re-examined as the
    # "previous letter" on the next iteration.
    for my $i (0 .. $#rev - 1) {
        $rev[ $i + 1 ] -= 1
            if $breaks_joining{ $rev[$i] }
            && $joined_to_previous{ $rev[ $i + 1 ] };
    }

    # --- pass 2: hamza seat (65163) between two non-joining letters --------
    # Starts at index 1: a hamza at the start of the word has no previous
    # letter, and index -1 would wrongly inspect the LAST letter.
    for my $i (1 .. $#rev - 1) {
        $rev[$i] = 1569
            if $rev[$i] == 65163
            && $breaks_joining{ $rev[ $i - 1 ] }
            && $breaks_joining{ $rev[ $i + 1 ] };
    }

    # --- pass 3: alef (isolated 65165 / final 65166) + lam medial (65248) --
    # The lam becomes lam initial (65247). Pass 1 normally has already done
    # this; kept as a safety net.
    for my $i (0 .. $#rev - 1) {
        $rev[ $i + 1 ] = 65247
            if ( $rev[$i] == 65165 || $rev[$i] == 65166 )
            && $rev[ $i + 1 ] == 65248;
    }

    # --- pass 4: yeh medial (65268) before a hamza ------------------------
    for my $i (0 .. $#rev - 1) {
        next unless $rev[$i] == 65268;
        if ( $rev[ $i + 1 ] == 65152 ) {
            $rev[$i] = 65266;           # yeh medial -> yeh final (65268 - 2)
        }
        elsif ( $rev[ $i + 1 ] == 65163 ) {
            $rev[ $i + 1 ] = 65164;     # hamza seat initial -> medial
        }
    }

    # --- pass 5: lam + alef ligatures --------------------------------------
    # The pair is replaced by ONE ligature code point: the alef at $i+1 is
    # removed with splice so that only this alef disappears and the indices of
    # the earlier letters stay valid.
    # NOTE(review): for a medial lam (65248) followed by alef-hamza (65156) or
    # alef-madda (65154) the isolated ligatures 65271/65269 are used, as in
    # the original; the final-form ligatures 65272/65270 look more correct --
    # confirm on the target renderer before changing.
    my %ligature_for = (
        '65247,65166' => 65275,    # lam initial + alef final
        '65248,65166' => 65276,    # lam medial  + alef final
        '65247,65156' => 65271,    # lam initial + alef with hamza above
        '65248,65156' => 65271,    # lam medial  + alef with hamza above
        '65247,65154' => 65269,    # lam initial + alef with madda
        '65248,65154' => 65269,    # lam medial  + alef with madda
    );
    for ( my $i = 0 ; $i < $#rev ; $i++ ) {
        my $ligature = $ligature_for{"$rev[$i],$rev[$i+1]"};
        next unless $ligature;
        splice @rev, $i + 1, 1;
        $rev[$i] = $ligature;
    }

    # --- pass 6: standalone hamza (1569) after a non-joining letter and
    #             before alef final goes back onto its seat ---------------
    for my $i (1 .. $#rev - 1) {
        $rev[$i] = 65163
            if $rev[$i] == 1569
            && $rev[ $i + 1 ] == 65166
            && $breaks_joining{ $rev[ $i - 1 ] };
    }

    # --- pass 7: hamza seat followed by teh marbuta final or alef final ----
    # Both letters are replaced by their base code points.
    for my $i (0 .. $#rev - 1) {
        next unless $rev[$i] == 65163;
        if ( $rev[ $i + 1 ] == 65172 ) {
            @rev[ $i, $i + 1 ] = ( 1569, 1577 );
        }
        elsif ( $rev[ $i + 1 ] == 65166 ) {
            @rev[ $i, $i + 1 ] = ( 1569, 1575 );
        }
    }

    #################################
    # The letters are written last-to-first.
    for my $code ( reverse @rev ) {
        print FILE chr($code);
    }
    return;
}
# Main loop: copy cue numbers and timing lines unchanged, convert every other
# line word by word (words are emitted in reverse order) and write the result
# to FILE.
for my $line (@lines) {

    # Only real SRT structure passes through: a line made of digits only (cue
    # number) or a line starting with a time stamp followed by "-->". An
    # optional byte-order mark in front is tolerated so the very first cue
    # number is not mistaken for text. Dialogue that merely starts with a
    # digit is converted like any other text line.
    if ( $line =~ m{^\x{FEFF}?(?:\d+\s*$|\d+:\d+:\d+[,.]\d+\s*-->)} ) {
        print FILE "$line";
        next;
    }

    ## The following substitutions remove font and position tags commonly
    ## found in subtitles; the last three add a space around punctuation.
    # Backslashes become "m", presumably so that "{\pos(x,y)}" turns into
    # "{mpos(x,y)}" and is caught by the mpos pattern below.
    $line =~ s/\\/m/g;
    $line =~ s|<.+?>||g;                        # any <...> tag, <font> included
    $line =~ s/\x{0640}/\x{0640} /g;            # tatweel -> tatweel + space
    ## Removes some special characters the script cannot handle; more can be
    ## added between the brackets. (The inverted question mark belongs to this
    ## Latin-1 run; a plain "?" here would delete every question mark before
    ## it can be converted.)
    $line =~ s/[®°±²³´µ¶·¸¹»¼½¾¿¢£¤¥¦§¨©«¬¯]//g;
    $line =~ s/\{mpos\(\d{3},\d{3}\)\}//g;
    $line =~ s/[\(\)-]//g;
    # Harakat U+064B..U+0652 (tanween, fatha, damma, kasra, shadda, sukun):
    # left in place they would count as letters when positions are assigned.
    $line =~ s/[\x{064B}-\x{0652}]//g;
    $line =~ s/(\w*)([-,.،])/$1 $2/g;
    $line =~ s/([\)\(-,.،؟\'\"])(\w*)/$1 $2/g;
    $line =~ s/(\w*)([\?\'\"\!\<\>\(\)؟])/$1 $2/g;

    # The loop variable must stay $_: convertword2univalues may read the
    # current word through $_ as well as through its argument.
    my @sent = reverse split( " ", $line );
    for (@sent) {
        convertword2univalues($_);
        print FILE " ";
    }
    print FILE "\n";
}

# Buffered write errors only surface when the handle is closed.
close FILE or die "error while writing the output file: $!\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment