Skip to content

Instantly share code, notes, and snippets.

@abjdiat
Last active December 21, 2015 23:38
Show Gist options
  • Save abjdiat/6383492 to your computer and use it in GitHub Desktop.
Save abjdiat/6383492 to your computer and use it in GitHub Desktop.
PS3 Friendly Arabic Subs Converter: a Perl script that converts UTF-8 encoded Arabic subtitles to a PS3-friendly format. For screenshots and examples, visit http://abjdiaty.blogspot.com/2012/01/ps3-friendly-arabic-subs-converter.html
#!/usr/bin/perl
#changelog http://abjdiaty.blogspot.com/2012/01/ps3-friendly-arabic-subs-converter.html
#email [email protected]
use utf8;
no warnings;
# ---------------------------------------------------------------------------
# Input / output setup.
#
# Usage: perl <this script> INPUT.srt
#
# Reads the UTF-8 subtitle file named by the first command line argument,
# copies it to a working file (temp.srt in the current directory), loads that
# copy into @lines and opens FILE for writing "<input without extension>_ps3.srt".
#
# The rest of the script relies on the bareword handle FILE and on @lines, and
# it runs without "use strict", so those names are kept exactly as they were.
# ---------------------------------------------------------------------------
my $input_path = $ARGV[0];
die "usage: $0 <utf-8 subtitle file>\n"
    unless defined $input_path && length $input_path;

# The file name has to be quoted: as a bareword expression, temp.srt is the
# concatenation of two barewords and silently produced a file called "tempsrt".
# NOTE(review): the round trip through a working copy looks unnecessary;
# reading IN straight into @lines should be equivalent - confirm before removing.
my $temp_path = "temp.srt";

open IN, "<:encoding(utf-8)", $input_path
    or die "cannot open '$input_path' for reading: $!\n";
open OUT, ">:encoding(utf-8)", $temp_path
    or die "cannot open '$temp_path' for writing: $!\n";
while (<IN>) {
    print OUT $_;
}
close IN;

# OUT must be closed before the copy is read back, otherwise whatever is still
# sitting in the output buffer (the tail of the file) is never seen by FILE2.
close OUT
    or die "cannot finish writing '$temp_path': $!\n";

# Output name: strip the last extension (if any) and append "_ps3.srt".
$new_name = $input_path;
$new_name =~ s/(.+)\.[^.]+$/$1/;
$new_name .= "_ps3.srt";

open FILE2, "<:encoding(utf-8)", $temp_path
    or die "cannot open '$temp_path' for reading: $!\n";

# An explicit UTF-8 layer makes every character (including Latin-1 ones in
# otherwise non-Arabic words) come out as valid UTF-8.
open FILE, ">:encoding(utf-8)", $new_name
    or die "cannot open '$new_name' for writing: $!\n";

my @lines = <FILE2>;
close FILE2;

# A leading byte order mark would hide the digit of the first cue number from
# the /^\d/ test in the main loop and send that line through the text branch.
$lines[0] =~ s/\A\x{FEFF}// if @lines;
# shiftrate(VALUE, AMOUNT)
#
# Subtracts AMOUNT from VALUE *in place*: the first argument is modified
# through the @_ alias, so the caller's variable (or array element) changes.
# Mind the sign: shiftrate($x, +1) lowers $x by one, shiftrate($x, -1) raises
# it by one.  The new value is also returned.
sub shiftrate {
    return $_[0] -= $_[1];
}
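#The @arabic_glyphs table (defined further down) is an array of arrays: each Arabic letter has its own row, because an Arabic letter is drawn differently depending on its position within the word.
#There is one row per supported letter, each holding six values; the Unicode value of the separate (base) letter is used as the row index.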
####################################################
# isItArabic(WORD)
#
# Tells whether WORD should be handled as Arabic text.  Every character of the
# word is examined (not only the first one): as soon as one of them has a code
# point in 1563..1791, or equal to 65267 or 65266, the string "true" is
# returned.  If no character qualifies, nothing (a false value) is returned.
#
# Side effect kept on purpose: the package variable @array1 is filled with the
# accepted code points, because clean_the_ar_word() reads that list.
sub isItArabic {
    my ($word) = @_;

    # Populate the shared list once instead of rebuilding it on every call.
    our @array1;
    @array1 = (1563 .. 1791, 65267, 65266) unless @array1;

    for my $char (split //, $word) {
        my $code_point = ord $char;

        # Plain numeric comparison replaces the smartmatch against @array1.
        return "true"
            if ($code_point >= 1563 && $code_point <= 1791)
            || $code_point == 65267
            || $code_point == 65266;
    }

    return;
}
################################################
# clean_the_ar_word(WORD)
#
# Splits WORD into characters and returns the list of their code points,
# keeping only those in 1563..1791 plus 65267 and 65266; every other
# character (Latin letters, digits, punctuation, ...) is dropped.
# Diacritics inside 1563..1791 are *not* removed here.
#
# The filter is a single grep: removing elements with splice inside an index
# loop skipped the element that followed each removal, so the second of two
# consecutive unwanted characters used to survive.  The accepted range is
# written out explicitly so the result does not depend on another sub having
# filled a shared list first.
sub clean_the_ar_word {
    my ($word) = @_;

    return grep {
        ($_ >= 1563 && $_ <= 1791) || $_ == 65267 || $_ == 65266
    } map { ord } split //, $word;
}
###############################################
#@arabic_glyphs[1632]=([1632,1632,1632,1632,1632,3]);
#@arabic_glyphs[1633]=([1633,1633,1633,1633,1633,3]);
#@arabic_glyphs[1634]=([1634,1634,1634,1634,1634,3]);
#@arabic_glyphs[1635]=([1635,1635,1635,1635,1635,3]);
#@arabic_glyphs[1636]=([1636,1636,1636,1636,1636,3]);
#@arabic_glyphs[1637]=([1637,1637,1637,1637,1637,3]);
#@arabic_glyphs[1638]=([1638,1638,1638,1638,1638,3]);
#@arabic_glyphs[1639]=([1639,1639,1639,1639,1639,3]);
#@arabic_glyphs[1640]=([1640,1640,1640,1640,1640,3]);
#@arabic_glyphs[1641]=([1641,1641,1641,1641,1641,3]);
# Glyph table used by convertword2univalues().
#
# The row for a letter is stored at the index equal to the letter's base
# Unicode code point.  Row layout:
#   [0] base code point (the lookup only accepts a row that contains the
#       code point being looked up, so this value must match the index)
#   [1] not read by the code in this file (looks like the isolated form)
#   [2] glyph used when the letter is the first letter of a word
#   [3] glyph used when the letter is in the middle of a word
#   [4] glyph used when the letter is the last letter of a word
#   [5] not read by the code in this file (presumably a count of forms)
#
# Corrections relative to the previous table:
#   * 1591 (Tah): medial and final glyphs were swapped (65218 <-> 65220).
#   * 1592 (Zah): the medial glyph was 65222 (the final form) instead of 65224.
#   * 1728: the base value was mistyped as 172, which made the row unusable,
#     and slot [2] held 64510, the glyph from the 1740 row above it.
#     NOTE(review): the row now follows the pattern of the other two-form
#     letters (isolated 64420 / final 64421) - confirm against the target font.
our @arabic_glyphs;

# Hamza and the alef/waw/yeh carriers
$arabic_glyphs[1569] = [ 1569, 65152, 65163, 65164, 65152, 3 ];
$arabic_glyphs[1570] = [ 1570, 65153, 65153, 65154, 65154, 2 ];
$arabic_glyphs[1571] = [ 1571, 65155, 65155, 65156, 65156, 2 ];
$arabic_glyphs[1572] = [ 1572, 65157, 65157, 65158, 65158, 2 ];
$arabic_glyphs[1573] = [ 1573, 65159, 65159, 65160, 65160, 2 ];
$arabic_glyphs[1574] = [ 1574, 65161, 65163, 65164, 65162, 2 ];
$arabic_glyphs[1575] = [ 1575, 65165, 65165, 65166, 65166, 2 ];

# Basic Arabic letters
$arabic_glyphs[1576] = [ 1576, 65167, 65169, 65170, 65168, 4 ];
$arabic_glyphs[1577] = [ 1577, 65171, 65171, 65172, 65172, 2 ];
$arabic_glyphs[1578] = [ 1578, 65173, 65175, 65176, 65174, 4 ];
$arabic_glyphs[1579] = [ 1579, 65177, 65179, 65180, 65178, 4 ];
$arabic_glyphs[1580] = [ 1580, 65181, 65183, 65184, 65182, 4 ];
$arabic_glyphs[1581] = [ 1581, 65185, 65187, 65188, 65186, 4 ];
$arabic_glyphs[1582] = [ 1582, 65189, 65191, 65192, 65190, 4 ];
$arabic_glyphs[1583] = [ 1583, 65193, 65193, 65194, 65194, 2 ];
$arabic_glyphs[1584] = [ 1584, 65195, 65195, 65196, 65196, 2 ];
$arabic_glyphs[1585] = [ 1585, 65197, 65197, 65198, 65198, 2 ];
$arabic_glyphs[1586] = [ 1586, 65199, 65199, 65200, 65200, 2 ];
$arabic_glyphs[1587] = [ 1587, 65201, 65203, 65204, 65202, 4 ];
$arabic_glyphs[1588] = [ 1588, 65205, 65207, 65208, 65206, 4 ];
$arabic_glyphs[1589] = [ 1589, 65209, 65211, 65212, 65210, 4 ];
$arabic_glyphs[1590] = [ 1590, 65213, 65215, 65216, 65214, 4 ];
$arabic_glyphs[1591] = [ 1591, 65217, 65219, 65220, 65218, 4 ];
$arabic_glyphs[1592] = [ 1592, 65221, 65223, 65224, 65222, 4 ];
$arabic_glyphs[1593] = [ 1593, 65225, 65227, 65228, 65226, 4 ];
$arabic_glyphs[1594] = [ 1594, 65229, 65231, 65232, 65230, 4 ];
$arabic_glyphs[1601] = [ 1601, 65233, 65235, 65236, 65234, 4 ];
$arabic_glyphs[1602] = [ 1602, 65237, 65239, 65240, 65238, 4 ];
$arabic_glyphs[1603] = [ 1603, 65241, 65243, 65244, 65242, 4 ];
$arabic_glyphs[1604] = [ 1604, 65245, 65247, 65248, 65246, 4 ];
$arabic_glyphs[1605] = [ 1605, 65249, 65251, 65252, 65250, 4 ];
$arabic_glyphs[1606] = [ 1606, 65253, 65255, 65256, 65254, 4 ];
$arabic_glyphs[1607] = [ 1607, 65257, 65259, 65260, 65258, 4 ];
$arabic_glyphs[1608] = [ 1608, 65261, 65261, 65262, 65262, 2 ];
$arabic_glyphs[1609] = [ 1609, 65263, 65263, 65264, 65264, 2 ];
$arabic_glyphs[1610] = [ 1610, 65265, 65267, 65268, 65266, 4 ];

# Hyphen-minus and the Arabic question mark map to themselves
$arabic_glyphs[45]   = [ 45, 45, 45, 45, 45, 4 ];
$arabic_glyphs[1567] = [ 1567, 1567, 1567, 1567, 1567, 1 ];

# Additional letters: Jeh, Tcheh, Peh, Swash Kaf (Keheh), Gaf
$arabic_glyphs[1688] = [ 1688, 64394, 64394, 64395, 64395, 2 ];
$arabic_glyphs[1670] = [ 1670, 64378, 64380, 64381, 64379, 4 ];
$arabic_glyphs[1662] = [ 1662, 64342, 64344, 64345, 64343, 4 ];
$arabic_glyphs[1705] = [ 1705, 64398, 64400, 64401, 64399, 4 ];
$arabic_glyphs[1711] = [ 1711, 64402, 64404, 64405, 64403, 4 ];

# Code points 1740 and 1728
$arabic_glyphs[1740] = [ 1740, 64508, 64510, 64511, 64509, 4 ];
$arabic_glyphs[1728] = [ 1728, 64420, 64420, 64421, 64421, 2 ];
# ---------------------------------------------------------------------------
# convertword2univalues(WORD)
#
# Writes one word of a subtitle line to the FILE handle.
#
# * If isItArabic(WORD) is false, the word is printed after replacing the
#   ASCII digits, '%' and '?' with their Arabic counterparts.
# * Otherwise the word is reduced to its Arabic code points with
#   clean_the_ar_word(), each letter is replaced by the glyph that
#   @arabic_glyphs lists for its position (first / middle / last letter), a
#   series of correction passes adjusts glyphs that depend on their
#   neighbours, and the glyphs are printed in reverse order.
#
# Differences from the previous implementation:
# * The ligature passes remove exactly the alef that follows the lam.  The old
#   code deleted every glyph in the word that had the same value, which lost
#   other alefs and could shift the position being overwritten.
# * "Previous glyph" tests are skipped for the first glyph instead of wrapping
#   around to the last glyph of the word through index -1.
# * The non-Arabic branch works on the argument rather than on the global $_.
# * Membership tests use hashes instead of the smartmatch operator, and all
#   working variables are lexical.
# * A correction pass that was repeated verbatim (alef final + lam medial) is
#   run once; the repetition could never find anything left to change.
# ---------------------------------------------------------------------------

# Glyphs after which the next letter cannot be drawn connected.
my %NON_JOINING_GLYPH = map { $_ => 1 } (
    65199,65153,65166,65200,65261,64394,64395,65193,65194,65195,65196,65197,
    65198,65157,65158,65262,65155,65154,65165,65159,65160,65166,65156,
);

# Glyphs that are drawn connected to the preceding letter.  For each of them
# the code point one lower is the form used when nothing connects from the
# previous letter, which is what the first correction pass relies on.
my %JOINED_GLYPH = map { $_ => 1 } (
    65170,65176,65180,65184,65188,65192,65204,65262,65208,65212,65216,65220,65224,65228,65232,65236,65240,
    65244,65248,65252,65256,65260,65268,65198,65166,65194,65172,65254,65250,65246,65242,65238,
    65234,65230,65226,65222,65218,65214,65210,65174,65206,65202,65200,65198,65196,65194,65190,
    65186,65182,65178,65168,65160,65162,65158,65156,65154,65258,65266,65264,65164,64343,64345,64401,64399,64405,64403,64395,64395,64381,64379,64511,64509,64421,
);

# "lam glyph,alef glyph" => single ligature glyph replacing the pair.
# 65247/65248 are the first-letter/middle-letter glyphs of lam (1604).
my %LAM_ALEF_LIGATURE = (
    "65247,65166" => 65275,    # lam + alef
    "65248,65166" => 65276,
    "65247,65156" => 65271,    # lam + alef with hamza above
    "65248,65156" => 65271,
    "65247,65154" => 65269,    # lam + alef with madda
    "65248,65154" => 65269,
);

# Returns the glyph stored in column SLOT of the @arabic_glyphs row for
# CODE_POINT, or nothing when the letter has no usable row.  A row is only
# honoured when it contains the code point itself (normally as its first
# value); this reproduces the membership test the lookup has always used.
sub _glyph_for_slot {
    my ($code_point, $slot) = @_;

    our @arabic_glyphs;
    my $row = $arabic_glyphs[$code_point];
    return unless ref $row eq 'ARRAY';
    return unless grep { $_ == $code_point } @$row;
    return $row->[$slot];
}

sub convertword2univalues {
    my ($word) = @_;

    # ----- Non-Arabic word: localise digits and punctuation, print as is ----
    if (!isItArabic($word)) {
        # NOTE(review): some of the replacement literals below appear to carry
        # an invisible directional mark after the digit; they are reproduced
        # unchanged - confirm whether that is intended.
        $word =~ s/0/٠‎/g;
        $word =~ s/1/١/g;
        $word =~ s/2/٢‎/g;
        $word =~ s/3/٣/g;
        $word =~ s/4/٤/g;
        $word =~ s/5/٥/g;
        $word =~ s/6/٦‎/g;
        $word =~ s/7/٧‎/g;
        $word =~ s/8/٨/g;
        $word =~ s/9/٩/g;
        $word =~ s/%/٪/g;
        $word =~ s/\?/؟/g;
        print FILE "$word";
        return;
    }

    # ----- Arabic word: choose a glyph for every letter by position ---------
    my @code_points = clean_the_ar_word($word);
    my @glyphs;
    for my $pos (0 .. $#code_points) {
        my $slot;
        if ($pos == 0) {
            $slot = 2;    # first letter (also used for one-letter words)
        }
        elsif ($pos == $#code_points) {
            $slot = 4;    # last letter
        }
        else {
            $slot = 3;    # letter in the middle
        }
        my $glyph = _glyph_for_slot($code_points[$pos], $slot);
        push @glyphs, $glyph if defined $glyph;    # letters without a row are dropped
    }

    # Pass 1: a connected glyph that follows a non-joining glyph is lowered by
    # one code point.  Runs left to right on purpose: a glyph changed here is
    # the "previous glyph" of the next step.
    for my $i (0 .. $#glyphs - 1) {
        $glyphs[$i + 1] -= 1
            if $NON_JOINING_GLYPH{ $glyphs[$i] } && $JOINED_GLYPH{ $glyphs[$i + 1] };
    }

    # Pass 2: glyph 65163 standing between two non-joining glyphs becomes the
    # bare hamza (1569).
    for my $i (1 .. $#glyphs - 1) {
        $glyphs[$i] = 1569
            if $glyphs[$i] == 65163
            && $NON_JOINING_GLYPH{ $glyphs[$i - 1] }
            && $NON_JOINING_GLYPH{ $glyphs[$i + 1] };
    }

    # Pass 3: alef (65165 or 65166) followed by lam glyph 65248 -> the lam is
    # lowered to 65247.
    for my $i (0 .. $#glyphs - 1) {
        $glyphs[$i + 1] -= 1
            if ($glyphs[$i] == 65165 || $glyphs[$i] == 65166)
            && $glyphs[$i + 1] == 65248;
    }

    # Pass 4: yeh glyph 65268 followed by a hamza glyph.
    for my $i (0 .. $#glyphs - 1) {
        next unless $glyphs[$i] == 65268;
        if ($glyphs[$i + 1] == 65152) {
            $glyphs[$i] -= 2;        # yeh becomes 65266
        }
        elsif ($glyphs[$i + 1] == 65163) {
            $glyphs[$i + 1] += 1;    # hamza glyph becomes 65164
        }
    }

    # Pass 5: lam + alef pairs collapse into a single ligature glyph.  The
    # array shrinks while we walk it, hence the index loop with a live bound.
    for (my $i = 0; $i < $#glyphs; $i++) {
        my $ligature = $LAM_ALEF_LIGATURE{"$glyphs[$i],$glyphs[$i + 1]"};
        splice @glyphs, $i, 2, $ligature if defined $ligature;
    }

    # Pass 6: bare hamza (1569) followed by alef 65166 and preceded by a
    # non-joining glyph goes back to glyph 65163 ...
    for my $i (1 .. $#glyphs - 1) {
        $glyphs[$i] = 65163
            if $glyphs[$i] == 1569
            && $glyphs[$i + 1] == 65166
            && $NON_JOINING_GLYPH{ $glyphs[$i - 1] };
    }

    # Pass 7: ... and glyph 65163 followed by 65172 or 65166 is written as the
    # bare hamza followed by the base letter (1577 or 1575 respectively).
    for my $i (0 .. $#glyphs - 1) {
        next unless $glyphs[$i] == 65163;
        if ($glyphs[$i + 1] == 65172) {
            $glyphs[$i]     = 1569;
            $glyphs[$i + 1] = 1577;
        }
        elsif ($glyphs[$i + 1] == 65166) {
            $glyphs[$i]     = 1569;
            $glyphs[$i + 1] = 1575;
        }
    }

    # The glyphs are written last letter first.
    for my $glyph (reverse @glyphs) {
        print FILE chr($glyph);
    }
    return;
}
# ---------------------------------------------------------------------------
# Main loop: rewrite every line of the subtitle file.
#
# Lines that start with a digit (cue numbers and timing lines) are copied
# unchanged.  Every other line is cleaned up, split on whitespace, and its
# words are written in reverse order through convertword2univalues(), each
# followed by a space, with a newline at the end of the line.
# NOTE(review): a text line that happens to start with a digit is also copied
# unchanged - confirm whether that is acceptable.
# ---------------------------------------------------------------------------
foreach my $line (@lines) {
    if ($line =~ m/^\d/) {
        print FILE "$line";
        next;
    }

    ## Remove font and position tags commonly found in subtitles.
    # Backslashes become "m", so a "{\pos(x,y)}" tag turns into "{mpos(x,y)}"
    # and is removed by the {mpos(...)} substitution below.
    $line =~ s/\\/m/g;
    $line =~ s|<.+?>||g;
    # Put a space after every tatweel.
    $line =~ s/ـ/ـ /g;
    ## Remove special characters the script cannot handle; add more inside the
    ## brackets if needed.  The class covers U+00A2-U+00A9, U+00AB-U+00B9,
    ## U+00BB-U+00BE and the question mark.
    # NOTE(review): removing '?' here means the later '?' handling never sees
    # one; it may originally have been a different character - confirm.
    $line =~ s/[\x{A2}-\x{A9}\x{AB}-\x{B9}\x{BB}-\x{BE}?]//g;
    $line =~ s/\{mpos\(\d{3},\d{3}\)\}//g;
    $line =~ s/[\(\)-]//g;
    # Strip the diacritics U+064B..U+0652 (tanween, fatha, damma, kasra,
    # shadda, sukun).  The former alternation only removed a kasratan when it
    # was followed by two specific marks, and never removed a sukun; a mark
    # left in a word counts as a letter when positions are worked out.
    $line =~ tr/\x{064B}-\x{0652}//d;
    $line =~ s/<font[^>]*>//g;
    ## Separate punctuation from the neighbouring word with a space.
    $line =~ s/(\w*)([-,.،])/$1 $2/g;
    $line =~ s/([\)\(-,.،؟\'\"])(\w*)/$1 $2/g;
    $line =~ s/(\w*)([\?\'\"\!\<\>\(\)؟])/$1 $2/g;

    my @sent = reverse split(" ", $line);

    # $_ is deliberately the loop variable here: convertword2univalues() has
    # historically read the current word from $_ as well as from its argument.
    foreach (@sent) {
        convertword2univalues($_);
        print FILE " ";
    }
    print FILE "\n";
}

# Buffered write errors only surface when the handle is closed.
close FILE
    or die "cannot finish writing the output file: $!\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment