38394-vm/includes/I18N/Arabic.php
2026-02-14 07:56:12 +00:00

175 lines
6.9 KiB
PHP

<?php
namespace I18N;
/**
 * Minimal Arabic shaper: replaces Arabic letters with their contextual
 * Presentation Forms-B glyphs and returns the text in reversed (visual) order.
 *
 * Code points are handled internally as uppercase hex strings of at least
 * four digits (e.g. '0628'), which is how the lookup tables are keyed.
 */
class Arabic {
    /**
     * Contextual glyphs per letter, keyed by the letter's hex code point.
     * Index 0 = isolated, 1 = final, 2 = initial, 3 = medial.
     * Letters that never join forward repeat their isolated/final glyphs in
     * the initial/medial slots.
     */
    private $presentation_forms = [
        '0621' => ['FE80', 'FE80', 'FE80', 'FE80'], // Hamza
        '0622' => ['FE81', 'FE82', 'FE81', 'FE82'], // Alef with Madda
        '0623' => ['FE83', 'FE84', 'FE83', 'FE84'], // Alef with Hamza Above
        '0624' => ['FE85', 'FE86', 'FE85', 'FE86'], // Waw with Hamza Above
        '0625' => ['FE87', 'FE88', 'FE87', 'FE88'], // Alef with Hamza Below
        '0626' => ['FE89', 'FE8A', 'FE8B', 'FE8C'], // Yeh with Hamza Above
        '0627' => ['FE8D', 'FE8E', 'FE8D', 'FE8E'], // Alef
        '0628' => ['FE8F', 'FE90', 'FE91', 'FE92'], // Beh
        '0629' => ['FE93', 'FE94', 'FE93', 'FE94'], // Teh Marbuta
        '062A' => ['FE95', 'FE96', 'FE97', 'FE98'], // Teh
        '062B' => ['FE99', 'FE9A', 'FE9B', 'FE9C'], // Theh
        '062C' => ['FE9D', 'FE9E', 'FE9F', 'FEA0'], // Jeem
        '062D' => ['FEA1', 'FEA2', 'FEA3', 'FEA4'], // Hah
        '062E' => ['FEA5', 'FEA6', 'FEA7', 'FEA8'], // Khah
        '062F' => ['FEA9', 'FEAA', 'FEA9', 'FEAA'], // Dal
        '0630' => ['FEAB', 'FEAC', 'FEAB', 'FEAC'], // Thal
        '0631' => ['FEAD', 'FEAE', 'FEAD', 'FEAE'], // Reh
        '0632' => ['FEAF', 'FEB0', 'FEAF', 'FEB0'], // Zain
        '0633' => ['FEB1', 'FEB2', 'FEB3', 'FEB4'], // Seen
        '0634' => ['FEB5', 'FEB6', 'FEB7', 'FEB8'], // Sheen
        '0635' => ['FEB9', 'FEBA', 'FEBB', 'FEBC'], // Sad
        '0636' => ['FEBD', 'FEBE', 'FEBF', 'FEC0'], // Dad
        '0637' => ['FEC1', 'FEC2', 'FEC3', 'FEC4'], // Tah
        '0638' => ['FEC5', 'FEC6', 'FEC7', 'FEC8'], // Zah
        '0639' => ['FEC9', 'FECA', 'FECB', 'FECC'], // Ain
        '063A' => ['FECD', 'FECE', 'FECF', 'FED0'], // Ghain
        '0640' => ['0640', '0640', '0640', '0640'], // Tatweel
        '0641' => ['FED1', 'FED2', 'FED3', 'FED4'], // Feh
        '0642' => ['FED5', 'FED6', 'FED7', 'FED8'], // Qaf
        '0643' => ['FED9', 'FEDA', 'FEDB', 'FEDC'], // Kaf
        '0644' => ['FEDD', 'FEDE', 'FEDF', 'FEE0'], // Lam
        '0645' => ['FEE1', 'FEE2', 'FEE3', 'FEE4'], // Meem
        '0646' => ['FEE5', 'FEE6', 'FEE7', 'FEE8'], // Noon
        '0647' => ['FEE9', 'FEEA', 'FEEB', 'FEEC'], // Heh
        '0648' => ['FEED', 'FEEE', 'FEED', 'FEEE'], // Waw
        '0649' => ['FEEF', 'FEF0', 'FEEF', 'FEF0'], // Alef Maksura
        '064A' => ['FEF1', 'FEF2', 'FEF3', 'FEF4'], // Yeh
    ];

    /**
     * Letters that never join to the letter that follows them (non-joining
     * Hamza plus the right-joining letters). Stored as a key set so lookups
     * are strict and O(1). Teh Marbuta (0629) belongs here: it only has
     * isolated/final glyphs.
     */
    private $disconnects_after = [
        '0621' => true, '0622' => true, '0623' => true, '0624' => true,
        '0625' => true, '0627' => true, '0629' => true, '062F' => true,
        '0630' => true, '0631' => true, '0632' => true, '0648' => true,
        '0649' => true,
    ];

    /**
     * Lam-Alef ligature glyphs keyed by the Alef variant that follows the Lam.
     * Index 0 = isolated, 1 = final (joined to the preceding letter).
     */
    private $lam_alef_ligatures = [
        '0622' => ['FEF5', 'FEF6'], // Lam-Alef Madda
        '0623' => ['FEF7', 'FEF8'], // Lam-Alef Hamza Above
        '0625' => ['FEF9', 'FEFA'], // Lam-Alef Hamza Below
        '0627' => ['FEFB', 'FEFC'], // Lam-Alef
    ];

    /**
     * Shape the Arabic letters in a UTF-8 string and reverse the result.
     *
     * Each letter found in the presentation-forms table is replaced by its
     * isolated/initial/medial/final glyph depending on its neighbours; Lam
     * followed by an Alef variant becomes a single ligature glyph. Arabic
     * combining marks are skipped when looking for neighbours, so vocalised
     * text still joins. Code points not in the table are passed through.
     *
     * Note: the reversal is applied to the whole string, including any
     * non-Arabic runs (presumably the output is meant for a renderer without
     * bidirectional text support — confirm against callers).
     *
     * @param string $str UTF-8 text in logical order.
     * @return string Shaped UTF-8 text in reversed order.
     */
    public function utf8Glyphs($str) {
        $hex = $this->utf8ToHexArray((string) $str);
        $len = count($hex);
        $res = [];

        for ($i = 0; $i < $len; $i++) {
            $current = $hex[$i];

            // Anything outside the table (spaces, Latin, marks, ...) is copied as-is.
            if (!isset($this->presentation_forms[$current])) {
                $res[] = $current;
                continue;
            }

            // Neighbours are the nearest code points that are not transparent marks.
            $prevIndex = $this->findBaseIndex($hex, $i, -1);
            $nextIndex = $this->findBaseIndex($hex, $i, 1);
            $prev = $prevIndex >= 0 ? $hex[$prevIndex] : null;
            $next = $nextIndex >= 0 ? $hex[$nextIndex] : null;

            $connect_prev = $prev !== null
                && $this->canConnect($prev)
                && $this->canJoinPrevious($current);

            // Lam + Alef variant collapses into one ligature glyph.
            if ($current === '0644' && $next !== null && isset($this->lam_alef_ligatures[$next])) {
                $res[] = $this->lam_alef_ligatures[$next][$connect_prev ? 1 : 0];
                // Keep any marks that sat between the Lam and the Alef, after the ligature.
                for ($j = $i + 1; $j < $nextIndex; $j++) {
                    $res[] = $hex[$j];
                }
                $i = $nextIndex; // the Alef (and the marks) are consumed
                continue;
            }

            // Joining forward needs both sides to agree: this letter must be able
            // to join forward and the next one must accept a join (Hamza does not).
            $connect_next = $next !== null
                && $this->canConnect($current)
                && $this->canJoinPrevious($next);

            // 0 isolated, 1 final (prev only), 2 initial (next only), 3 medial (both).
            $form = ($connect_prev ? 1 : 0) + ($connect_next ? 2 : 0);
            $res[] = $this->presentation_forms[$current][$form];
        }

        return $this->mb_strrev($this->hexArrayToUtf8($res));
    }

    /**
     * Find the nearest code point that is not a transparent mark.
     *
     * @param array $hex  Hex code points of the whole string.
     * @param int   $from Index to start from (exclusive).
     * @param int   $step -1 to search backwards, 1 to search forwards.
     * @return int Index of the neighbour, or -1 when there is none.
     */
    private function findBaseIndex(array $hex, $from, $step) {
        $len = count($hex);
        for ($j = $from + $step; $j >= 0 && $j < $len; $j += $step) {
            if (!$this->isTransparent($hex[$j])) {
                return $j;
            }
        }
        return -1;
    }

    /**
     * Whether $hex is an Arabic combining mark that does not interrupt joining
     * (harakat, superscript Alef, Quranic annotation marks).
     */
    private function isTransparent($hex) {
        $cp = hexdec($hex);
        return ($cp >= 0x0610 && $cp <= 0x061A)
            || ($cp >= 0x064B && $cp <= 0x065F)
            || $cp === 0x0670
            || ($cp >= 0x06D6 && $cp <= 0x06DC)
            || ($cp >= 0x06DF && $cp <= 0x06E4)
            || $cp === 0x06E7
            || $cp === 0x06E8
            || ($cp >= 0x06EA && $cp <= 0x06ED);
    }

    /**
     * Can $hex join to the letter that FOLLOWS it?
     * True for table letters that are not listed in $disconnects_after.
     */
    private function canConnect($hex) {
        return isset($this->presentation_forms[$hex]) && !isset($this->disconnects_after[$hex]);
    }

    /**
     * Can $hex join to the letter that PRECEDES it?
     * True for every table letter except the non-joining Hamza.
     */
    private function canJoinPrevious($hex) {
        return isset($this->presentation_forms[$hex]) && $hex !== '0621';
    }

    /** Whether $hex is one of the letters in the presentation-forms table. */
    private function isArabic($hex) {
        return isset($this->presentation_forms[$hex]);
    }

    /**
     * Split a UTF-8 string into uppercase hex code points ("%04X").
     *
     * @param string $str
     * @return array List of hex strings, one per character.
     */
    private function utf8ToHexArray($str) {
        // Single linear pass; mb_substr() per index is quadratic on UTF-8.
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // preg_split() rejects malformed UTF-8; fall back to mbstring,
            // which still yields one item per (possibly invalid) character.
            $chars = [];
            $count = mb_strlen($str, 'UTF-8');
            for ($i = 0; $i < $count; $i++) {
                $chars[] = mb_substr($str, $i, 1, 'UTF-8');
            }
        }

        $out = [];
        foreach ($chars as $c) {
            $out[] = sprintf('%04X', $this->uniord($c));
        }
        return $out;
    }

    /**
     * Inverse of utf8ToHexArray(): build a UTF-8 string from hex code points.
     *
     * @param array $arr List of hex strings.
     * @return string
     */
    private function hexArrayToUtf8($arr) {
        $str = '';
        foreach ($arr as $hex) {
            $str .= $this->unichr(hexdec($hex));
        }
        return $str;
    }

    /**
     * Code point of a single UTF-8 character.
     *
     * Goes through UCS-4 so code points above U+FFFF survive (UCS-2 cannot
     * represent them).
     *
     * @param string $u One UTF-8 character.
     * @return int
     */
    private function uniord($u) {
        if (strlen($u) === 1) {
            return ord($u);
        }
        $bytes = mb_convert_encoding($u, 'UCS-4BE', 'UTF-8');
        if (!is_string($bytes) || strlen($bytes) < 4) {
            return 0xFFFD; // not convertible: use the replacement character
        }
        $parts = unpack('N', $bytes);
        return $parts[1];
    }

    /**
     * UTF-8 encoding of a code point (full Unicode range).
     *
     * @param int $u Code point.
     * @return string
     */
    private function unichr($u) {
        return mb_convert_encoding(pack('N', $u), 'UTF-8', 'UCS-4BE');
    }

    /**
     * Reverse a UTF-8 string while keeping combining marks after the base
     * character they belong to, so marks do not migrate to a neighbouring glyph.
     *
     * @param string $str
     * @return string
     */
    private function mb_strrev($str) {
        // Unit = one non-mark character plus its trailing marks, or a run of
        // marks that has no base (e.g. at the very start of the string).
        preg_match_all('/\P{M}\p{M}*|\p{M}+/us', $str, $ar);
        return join('', array_reverse($ar[0]));
    }
}