<?php

namespace I18N;

/**
 * Minimal Arabic shaper for renderers that draw glyphs strictly left to right
 * and perform no contextual shaping of their own.
 *
 * utf8Glyphs() replaces every Arabic letter with the Unicode "Arabic
 * Presentation Forms-B" glyph matching its position in the word (isolated,
 * final, initial or medial), folds Lam + Alef pairs into their mandatory
 * ligature, and finally reverses the string into visual order.
 *
 * Known limitation: the WHOLE string is reversed. No bidirectional algorithm
 * is applied, so embedded Latin text and digits come out reversed as well.
 */
class Arabic {

    /**
     * Arabic letter => presentation-form glyphs, as 4-digit uppercase hex
     * code points. Every row is ordered [isolated, final, initial, medial].
     *
     * Letters that never join to the following letter repeat their isolated
     * and final glyphs in the initial and medial slots.
     */
    private $presentation_forms = [
        '0621' => ['FE80', 'FE80', 'FE80', 'FE80'], // Hamza
        '0622' => ['FE81', 'FE82', 'FE81', 'FE82'], // Alef with Madda
        '0623' => ['FE83', 'FE84', 'FE83', 'FE84'], // Alef with Hamza Above
        '0624' => ['FE85', 'FE86', 'FE85', 'FE86'], // Waw with Hamza Above
        '0625' => ['FE87', 'FE88', 'FE87', 'FE88'], // Alef with Hamza Below
        '0626' => ['FE89', 'FE8A', 'FE8B', 'FE8C'], // Yeh with Hamza Above
        '0627' => ['FE8D', 'FE8E', 'FE8D', 'FE8E'], // Alef
        '0628' => ['FE8F', 'FE90', 'FE91', 'FE92'], // Beh
        '0629' => ['FE93', 'FE94', 'FE93', 'FE94'], // Teh Marbuta
        '062A' => ['FE95', 'FE96', 'FE97', 'FE98'], // Teh
        '062B' => ['FE99', 'FE9A', 'FE9B', 'FE9C'], // Theh
        '062C' => ['FE9D', 'FE9E', 'FE9F', 'FEA0'], // Jeem
        '062D' => ['FEA1', 'FEA2', 'FEA3', 'FEA4'], // Hah
        '062E' => ['FEA5', 'FEA6', 'FEA7', 'FEA8'], // Khah
        '062F' => ['FEA9', 'FEAA', 'FEA9', 'FEAA'], // Dal
        '0630' => ['FEAB', 'FEAC', 'FEAB', 'FEAC'], // Thal
        '0631' => ['FEAD', 'FEAE', 'FEAD', 'FEAE'], // Reh
        '0632' => ['FEAF', 'FEB0', 'FEAF', 'FEB0'], // Zain
        '0633' => ['FEB1', 'FEB2', 'FEB3', 'FEB4'], // Seen
        '0634' => ['FEB5', 'FEB6', 'FEB7', 'FEB8'], // Sheen
        '0635' => ['FEB9', 'FEBA', 'FEBB', 'FEBC'], // Sad
        '0636' => ['FEBD', 'FEBE', 'FEBF', 'FEC0'], // Dad
        '0637' => ['FEC1', 'FEC2', 'FEC3', 'FEC4'], // Tah
        '0638' => ['FEC5', 'FEC6', 'FEC7', 'FEC8'], // Zah
        '0639' => ['FEC9', 'FECA', 'FECB', 'FECC'], // Ain
        '063A' => ['FECD', 'FECE', 'FECF', 'FED0'], // Ghain
        '0640' => ['0640', '0640', '0640', '0640'], // Tatweel
        '0641' => ['FED1', 'FED2', 'FED3', 'FED4'], // Feh
        '0642' => ['FED5', 'FED6', 'FED7', 'FED8'], // Qaf
        '0643' => ['FED9', 'FEDA', 'FEDB', 'FEDC'], // Kaf
        '0644' => ['FEDD', 'FEDE', 'FEDF', 'FEE0'], // Lam
        '0645' => ['FEE1', 'FEE2', 'FEE3', 'FEE4'], // Meem
        '0646' => ['FEE5', 'FEE6', 'FEE7', 'FEE8'], // Noon
        '0647' => ['FEE9', 'FEEA', 'FEEB', 'FEEC'], // Heh
        '0648' => ['FEED', 'FEEE', 'FEED', 'FEEE'], // Waw
        '0649' => ['FEEF', 'FEF0', 'FEEF', 'FEF0'], // Alef Maksura
        '064A' => ['FEF1', 'FEF2', 'FEF3', 'FEF4'], // Yeh
    ];

    /**
     * Letters that never join to the letter that FOLLOWS them: the
     * non-joining Hamza plus the right-joining-only letters.
     */
    private $disconnects_after = [
        '0621', '0622', '0623', '0624', '0625', '0627',
        '062F', '0630', '0631', '0632', '0648', '0649',
    ];

    /**
     * Alef variant following a Lam => [isolated, final] Lam-Alef ligature glyph.
     */
    private $lam_alef_ligatures = [
        '0622' => ['FEF5', 'FEF6'], // Lam-Alef with Madda
        '0623' => ['FEF7', 'FEF8'], // Lam-Alef with Hamza Above
        '0625' => ['FEF9', 'FEFA'], // Lam-Alef with Hamza Below
        '0627' => ['FEFB', 'FEFC'], // Lam-Alef
    ];

    /**
     * Shape a logical-order UTF-8 string and return it in visual order.
     *
     * Characters without an entry in the presentation-form table are copied
     * through unchanged. Combining marks (harakat) are ignored when deciding
     * how the letters around them join.
     *
     * @param string $str UTF-8 text in logical (typing) order.
     * @return string UTF-8 text made of presentation-form glyphs, reversed.
     */
    public function utf8Glyphs($str) {
        $hex = $this->utf8ToHexArray($str);
        $len = count($hex);
        $res = [];

        for ($i = 0; $i < $len; $i++) {
            $current = $hex[$i];

            // Anything that is not a known Arabic letter is emitted as-is.
            if (!isset($this->presentation_forms[$current])) {
                $res[] = $current;
                continue;
            }

            $prev = $this->neighbour($hex, $i, -1);
            $connect_prev = $prev !== null && $this->joins($prev, $current);

            // Lam immediately followed by an Alef variant collapses into one ligature glyph.
            if ($current === '0644' && $i + 1 < $len && isset($this->lam_alef_ligatures[$hex[$i + 1]])) {
                $res[] = $this->lam_alef_ligatures[$hex[$i + 1]][$connect_prev ? 1 : 0];
                // Skip the Alef. It stays in $hex, so the letter after the
                // ligature still sees a non-connecting Alef as its predecessor.
                $i++;
                continue;
            }

            $next = $this->neighbour($hex, $i, 1);
            $connect_next = $next !== null && $this->joins($current, $next);

            // Row layout is [isolated, final, initial, medial]:
            // joining backwards contributes 1, joining forwards contributes 2.
            $form = ($connect_prev ? 1 : 0) + ($connect_next ? 2 : 0);

            $res[] = $this->presentation_forms[$current][$form];
        }

        return $this->mb_strrev($this->hexArrayToUtf8($res));
    }

    /**
     * Whether two letters, adjacent in logical order, are drawn connected.
     *
     * @param string $left  Hex code point of the earlier letter.
     * @param string $right Hex code point of the later letter.
     * @return bool
     */
    private function joins($left, $right) {
        return $this->canConnect($left) && $this->canJoinPrevious($right);
    }

    /**
     * Whether $hex is a known Arabic letter that joins to the letter AFTER it.
     *
     * @param string $hex Hex code point.
     * @return bool
     */
    private function canConnect($hex) {
        return isset($this->presentation_forms[$hex])
            && !in_array($hex, $this->disconnects_after, true);
    }

    /**
     * Whether $hex is a known Arabic letter that joins to the letter BEFORE it.
     * Every letter in the table does so except the non-joining Hamza.
     *
     * @param string $hex Hex code point.
     * @return bool
     */
    private function canJoinPrevious($hex) {
        return isset($this->presentation_forms[$hex]) && $hex !== '0621';
    }

    /**
     * Whether $hex is a combining mark that must not interrupt joining:
     * U+064B..U+065F (tanween, short vowels, shadda, sukun, ...) or
     * U+0670 (superscript Alef).
     *
     * @param string $hex Hex code point.
     * @return bool
     */
    private function isTransparent($hex) {
        $code = hexdec($hex);

        return ($code >= 0x064B && $code <= 0x065F) || $code === 0x0670;
    }

    /**
     * Nearest code point before ($step = -1) or after ($step = 1) position
     * $index, skipping transparent combining marks.
     *
     * @param string[] $hex   Hex code points of the whole string.
     * @param int      $index Position to start from (exclusive).
     * @param int      $step  -1 to look backwards, 1 to look forwards.
     * @return string|null Hex code point, or null when the string ends first.
     */
    private function neighbour(array $hex, $index, $step) {
        for ($j = $index + $step; isset($hex[$j]); $j += $step) {
            if (!$this->isTransparent($hex[$j])) {
                return $hex[$j];
            }
        }

        return null;
    }

    /**
     * Decode a UTF-8 string into uppercase hex code points, zero-padded to at
     * least four digits.
     *
     * The string is converted through UCS-4 in a single pass, so code points
     * above U+FFFF survive intact. How malformed UTF-8 is represented is left
     * to mbstring's conversion.
     *
     * @param string $str UTF-8 text.
     * @return string[] One hex string per code point.
     */
    private function utf8ToHexArray($str) {
        $str = (string) $str;
        if ($str === '') {
            return [];
        }

        $out = [];
        foreach (unpack('N*', mb_convert_encoding($str, 'UCS-4BE', 'UTF-8')) as $code) {
            $out[] = sprintf('%04X', $code);
        }

        return $out;
    }

    /**
     * Encode a list of hex code points back into a UTF-8 string.
     *
     * @param string[] $arr Hex code points.
     * @return string UTF-8 text.
     */
    private function hexArrayToUtf8($arr) {
        $ucs4 = '';
        foreach ($arr as $hex) {
            $ucs4 .= pack('N', hexdec($hex));
        }

        return mb_convert_encoding($ucs4, 'UTF-8', 'UCS-4BE');
    }

    /**
     * Reverse a UTF-8 string by grapheme cluster, so that combining marks
     * stay behind the base glyph they belong to.
     *
     * @param string $str UTF-8 text.
     * @return string Reversed text; $str itself when nothing could be matched.
     */
    private function mb_strrev($str) {
        if (!preg_match_all('/\X/u', $str, $clusters)) {
            // Empty input (0 matches) or a PCRE failure (false): nothing to reverse.
            return $str;
        }

        return implode('', array_reverse($clusters[0]));
    }
}