36782-vm/includes/pdfparser/Document/CMap/ToUnicode/ToUnicodeCMapParser.php
2026-01-09 07:13:59 +00:00

92 lines
5.2 KiB
PHP

<?php declare(strict_types=1);
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
use PrinsFrank\PdfParser\Exception\ParseFailureException;
use PrinsFrank\PdfParser\Exception\PdfParserException;
use PrinsFrank\PdfParser\Stream\Stream;
/**
 * Parses the textual body of a ToUnicode CMap into a ToUnicodeCMap instance.
 *
 * Three kinds of sections are read from the stream: the first `codespacerange` section, and every
 * `bfchar` and `bfrange` section. The bfchar/bfrange entries are handed to the ToUnicodeCMap in the
 * order in which their sections occur in the stream.
 *
 * @internal
 */
class ToUnicodeCMapParser {
    /**
     * Whitespace bytes removed from hex strings before they are interpreted: the characters stripped by
     * trim() by default, plus form feed.
     */
    private const HEX_STRING_WHITESPACE = [' ', "\t", "\n", "\r", "\f", "\0", "\x0B"];

    /**
     * Parses the CMap located in $stream between $startOffset and $startOffset + $nrOfBytes.
     *
     * @param Stream $stream stream containing the CMap
     * @param int $startOffset position in the stream at which to start looking for CMap operators
     * @param int $nrOfBytes number of bytes after $startOffset that belong to the CMap
     *
     * @throws PdfParserException when the codespacerange section is missing or malformed, when a section is
     *                            not terminated by its end operator, or when a bfchar/bfrange section
     *                            contains no recognizable entries
     */
    public static function parse(Stream $stream, int $startOffset, int $nrOfBytes): ToUnicodeCMap {
        $endOffset = $startOffset + $nrOfBytes;
        [$codeSpaceRanges, $byteSize] = self::parseCodeSpaceRanges($stream, $startOffset, $endOffset);

        // Both helpers key their result by stream position. Sections never overlap, so the keys are unique
        // and the union followed by ksort() restores the order in which the sections occur in the CMap.
        /** @var array<int, list<BFRange|BFChar>> $bfCharRangeInfo */
        $bfCharRangeInfo = self::parseBFChars($stream, $startOffset, $endOffset)
            + self::parseBFRanges($stream, $startOffset, $endOffset);
        ksort($bfCharRangeInfo);

        return new ToUnicodeCMap(
            $codeSpaceRanges,
            $byteSize ?? 2, // Fall back to two bytes when the codespacerange section lists no ranges
            ...array_merge(...$bfCharRangeInfo)
        );
    }

    /**
     * Parses the first codespacerange section, expecting one `<start> <end>` pair per line.
     *
     * @return array{list<CodeSpaceRange>, int|null} the ranges and their shared size in bytes, null when no range was listed
     *
     * @throws PdfParserException
     */
    private static function parseCodeSpaceRanges(Stream $stream, int $startOffset, int $endOffset): array {
        $sectionStart = $stream->firstPos(ToUnicodeCMapOperator::BeginCodeSpaceRange, $startOffset, $endOffset)
            ?? throw new ParseFailureException(sprintf('Missing %s', ToUnicodeCMapOperator::BeginCodeSpaceRange->value));
        $sectionStart += strlen(ToUnicodeCMapOperator::BeginCodeSpaceRange->value);
        $sectionEnd = $stream->firstPos(ToUnicodeCMapOperator::EndCodeSpaceRange, $sectionStart, $endOffset)
            ?? throw new ParseFailureException(sprintf('Missing %s', ToUnicodeCMapOperator::EndCodeSpaceRange->value));

        // Split on CR LF, CR and LF so that files using CR-only line endings are handled as well
        $lines = preg_split('/\r\n|\r|\n/', $stream->read($sectionStart, $sectionEnd - $sectionStart));
        if ($lines === false) {
            throw new ParseFailureException('Unable to split codespacerange section into lines');
        }

        $codeSpaceRanges = [];
        $byteSize = null;
        foreach ($lines as $line) {
            if (trim($line) === '') {
                continue;
            }

            if (preg_match('/^\s*<\s*(?P<start>[0-9a-fA-F]+)\s*>\s*<\s*(?P<end>[0-9a-fA-F]+)\s*>\s*$/', $line, $match) !== 1) {
                throw new ParseFailureException('Unrecognized codespacerange format');
            }

            if (strlen($match['start']) !== strlen($match['end'])) {
                throw new ParseFailureException(sprintf('Start(%s) and end(%s) of codespacerange don\'t have the same number of bytes', $match['start'], $match['end']));
            }

            $nrOfHexDigits = strlen($match['start']);
            if ($nrOfHexDigits % 2 !== 0) {
                throw new ParseFailureException(sprintf('Codespaceranges must be an even number of hex digits, got %d', $nrOfHexDigits));
            }

            // Two hex digits encode one byte
            $byteSizeRange = intdiv($nrOfHexDigits, 2);
            if ($byteSize !== null && $byteSizeRange !== $byteSize) {
                throw new ParseFailureException(sprintf('Byte size of codespaceranges is inconsistent, expected %d, got %d', $byteSize, $byteSizeRange));
            }

            $byteSize = $byteSizeRange;
            $codeSpaceRanges[] = new CodeSpaceRange((int) hexdec($match['start']), (int) hexdec($match['end']));
        }

        return [$codeSpaceRanges, $byteSize];
    }

    /**
     * Parses every bfchar section into BFChar entries.
     *
     * @return array<int, list<BFChar>> entries per section, keyed by the stream position at which the section content starts
     *
     * @throws PdfParserException
     */
    private static function parseBFChars(Stream $stream, int $startOffset, int $endOffset): array {
        $bfChars = [];
        foreach (self::readSections($stream, ToUnicodeCMapOperator::BeginBFChar, ToUnicodeCMapOperator::EndBFChar, $startOffset, $endOffset) as $position => $section) {
            $nrOfMatches = preg_match_all('/\s*<(?P<source>[^>]+)>\s*<(?P<destination>[^>]+)>\s*/', $section, $matches, PREG_SET_ORDER);
            if ($nrOfMatches === false || $nrOfMatches === 0) {
                throw new ParseFailureException('Unrecognized bfchar format');
            }

            foreach ($matches as $match) {
                $bfChars[$position][] = new BFChar(
                    (int) hexdec(self::stripWhitespace($match['source'])),
                    self::stripWhitespace($match['destination'])
                );
            }
        }

        return $bfChars;
    }

    /**
     * Parses every bfrange section into BFRange entries.
     *
     * The target of a range is either a single hex string or an array of hex strings; in both cases the
     * BFRange receives a list of hex strings without the surrounding angle brackets.
     *
     * @return array<int, list<BFRange>> entries per section, keyed by the stream position at which the section content starts
     *
     * @throws PdfParserException
     */
    private static function parseBFRanges(Stream $stream, int $startOffset, int $endOffset): array {
        $bfRanges = [];
        foreach (self::readSections($stream, ToUnicodeCMapOperator::BeginBFRange, ToUnicodeCMapOperator::EndBFRange, $startOffset, $endOffset) as $position => $section) {
            $nrOfMatches = preg_match_all('/\s*<(?P<start>[^>]+)>\s*<(?P<end>[^>]+)>\s*(?P<targetString>(<[^>]+>)|(\[\s*(<[^>]+>\s*)+\]))/', $section, $matches, PREG_SET_ORDER);
            if ($nrOfMatches === false || $nrOfMatches === 0) {
                throw new ParseFailureException('Unrecognized bfrange format');
            }

            foreach ($matches as $match) {
                // Extract every <...> entry individually instead of splitting on '><', so that entries
                // separated by tabs or line breaks are recognized as well as space separated ones.
                $nrOfTargets = preg_match_all('/<([^>]+)>/', $match['targetString'], $targetMatches);
                if ($nrOfTargets === false || $nrOfTargets === 0) {
                    throw new ParseFailureException('Unrecognized bfrange format');
                }

                $bfRanges[$position][] = new BFRange(
                    (int) hexdec(self::stripWhitespace($match['start'])),
                    (int) hexdec(self::stripWhitespace($match['end'])),
                    array_map(
                        static fn (string $target): string => self::stripWhitespace($target),
                        $targetMatches[1]
                    )
                );
            }
        }

        return $bfRanges;
    }

    /**
     * Collects the content between each $beginOperator and the first $endOperator that follows it.
     *
     * @return array<int, string> section content, keyed by the stream position directly after the begin operator
     *
     * @throws PdfParserException when a begin operator is not followed by its end operator before $endOffset
     */
    private static function readSections(Stream $stream, ToUnicodeCMapOperator $beginOperator, ToUnicodeCMapOperator $endOperator, int $startOffset, int $endOffset): array {
        $sections = [];
        $searchFrom = $startOffset;
        while (($contentStart = $stream->firstPos($beginOperator, $searchFrom, $endOffset)) !== null) {
            $contentStart += strlen($beginOperator->value);
            $contentEnd = $stream->firstPos($endOperator, $contentStart, $endOffset)
                ?? throw new ParseFailureException(sprintf('Missing %s', $endOperator->value));

            $sections[$contentStart] = $stream->read($contentStart, $contentEnd - $contentStart);
            $searchFrom = $contentEnd;
        }

        return $sections;
    }

    /**
     * Removes all whitespace from the content of a hex string, as whitespace carries no meaning there and
     * hexdec() raises a deprecation for characters that are not hex digits.
     */
    private static function stripWhitespace(string $hexString): string {
        return str_replace(self::HEX_STRING_WHITESPACE, '', $hexString);
    }
}