Drobne bug-fixes (maile i czytanie pdf dla GPT-api)
This commit is contained in:
parent
595f1a1c17
commit
2823787d46
@ -1,5 +1,9 @@
|
||||
<?php
|
||||
require_once __DIR__ . '/../includes/init.php';
|
||||
require_once __DIR__ . '/../includes/auth.php';
|
||||
require_once __DIR__ . '/../includes/pdfparser_autoloader.php';
|
||||
|
||||
use \PrinsFrank\PdfParser\PdfParser;
|
||||
|
||||
require_admin();
|
||||
|
||||
@ -17,11 +21,12 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST') {
|
||||
$title = $_POST['title'];
|
||||
$content = $_POST['content'];
|
||||
$tags = $_POST['tags'] ?? null;
|
||||
$product_id = $_POST['product_id'] ?: null;
|
||||
$language = $_POST['language'];
|
||||
$is_active = isset($_POST['is_active']);
|
||||
$product_id = !empty($_POST['product_id']) ? $_POST['product_id'] : null;
|
||||
$language = $_POST['language'] ?? 'en';
|
||||
$is_active = isset($_POST['is_active']) ? 1 : 0;
|
||||
|
||||
$file_path = $document['file_path'] ?? null;
|
||||
$file_content = $document['file_content'] ?? null;
|
||||
|
||||
// Handle file removal
|
||||
if ($id && isset($_POST['remove_file']) && $_POST['remove_file'] == '1') {
|
||||
@ -29,6 +34,7 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST') {
|
||||
unlink(__DIR__ . '/../uploads/kb_documents/' . $file_path);
|
||||
}
|
||||
$file_path = null;
|
||||
$file_content = null;
|
||||
}
|
||||
|
||||
// Handle new file upload
|
||||
@ -53,16 +59,20 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST') {
|
||||
|
||||
if (move_uploaded_file($_FILES['pdf_file']['tmp_name'], $target_path)) {
|
||||
$file_path = $new_file_name;
|
||||
|
||||
$parser = new PdfParser();
|
||||
$pdf = $parser->parseFile($target_path);
|
||||
$file_content = $pdf->getText();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($id) {
|
||||
$stmt = db()->prepare("UPDATE kb_documents SET title = ?, content = ?, tags = ?, product_id = ?, language = ?, is_active = ?, file_path = ? WHERE id = ?");
|
||||
$stmt->execute([$title, $content, $tags, $product_id, $language, $is_active, $file_path, $id]);
|
||||
$stmt = db()->prepare("UPDATE kb_documents SET title = ?, content = ?, tags = ?, product_id = ?, language = ?, is_active = ?, file_path = ?, file_content = ? WHERE id = ?");
|
||||
$stmt->execute([$title, $content, $tags, $product_id, $language, $is_active, $file_path, $file_content, $id]);
|
||||
} else {
|
||||
$stmt = db()->prepare("INSERT INTO kb_documents (title, content, tags, product_id, language, is_active, file_path) VALUES (?, ?, ?, ?, ?, ?, ?)");
|
||||
$stmt->execute([$title, $content, $tags, $product_id, $language, $is_active, $file_path]);
|
||||
$stmt = db()->prepare("INSERT INTO kb_documents (title, content, tags, product_id, language, is_active, file_path, file_content) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
|
||||
$stmt->execute([$title, $content, $tags, $product_id, $language, $is_active, $file_path, $file_content]);
|
||||
}
|
||||
|
||||
header('Location: kb_documents.php');
|
||||
|
||||
1
admin/order_details_error.log
Normal file
1
admin/order_details_error.log
Normal file
@ -0,0 +1 @@
|
||||
[09-Jan-2026 07:03:16 UTC] PHP Warning: Undefined variable $pageTitle in /home/ubuntu/executor/workspace/admin/order_details.php on line 133
|
||||
@ -26,7 +26,7 @@ function search_kb($message) {
|
||||
$sql = "SELECT * FROM kb_documents WHERE is_active = 1 AND (";
|
||||
$conditions = [];
|
||||
foreach ($terms as $term) {
|
||||
$conditions[] = "title LIKE ? OR content LIKE ?";
|
||||
$conditions[] = "title LIKE ? OR content LIKE ? OR file_content LIKE ?";
|
||||
}
|
||||
$sql .= implode(' OR ', $conditions) . ") LIMIT 3";
|
||||
|
||||
@ -35,6 +35,7 @@ function search_kb($message) {
|
||||
foreach ($terms as $term) {
|
||||
$params[] = '%' . $term . '%';
|
||||
$params[] = '%' . $term . '%';
|
||||
$params[] = '%' . $term . '%';
|
||||
}
|
||||
$stmt->execute($params);
|
||||
return $stmt->fetchAll();
|
||||
@ -62,6 +63,9 @@ if (!empty($kb_documents)) {
|
||||
foreach ($kb_documents as $doc) {
|
||||
$system_prompt .= "- Title: " . $doc['title'] . "\n";
|
||||
$system_prompt .= " Content: " . $doc['content'] . "\n";
|
||||
if (!empty($doc['file_content'])) {
|
||||
$system_prompt .= " File Content: " . $doc['file_content'] . "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
1
db/migrations/036_add_file_content_to_kb_documents.sql
Normal file
1
db/migrations/036_add_file_content_to_kb_documents.sql
Normal file
@ -0,0 +1 @@
|
||||
ALTER TABLE `kb_documents` ADD `file_content` TEXT NULL;
|
||||
1096
debug_price.log
1096
debug_price.log
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,20 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\Registry\Adobe;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\CMap\Registry\CMapResource;
|
||||
use PrinsFrank\PdfParser\Document\CMap\ToUnicode\BFRange;
|
||||
use PrinsFrank\PdfParser\Document\CMap\ToUnicode\CodeSpaceRange;
|
||||
use PrinsFrank\PdfParser\Document\CMap\ToUnicode\ToUnicodeCMap;
|
||||
|
||||
/**
 * CMap resource that maps every two-byte character code in 0x0000-0xFFFF through a single
 * BFRange starting at destination '0000'.
 */
class Identity0 implements CMapResource {
    /**
     * Builds the ToUnicode CMap for this resource.
     *
     * @return ToUnicodeCMap a CMap with byte size 2, one code space range and one BFRange,
     *                       both spanning 0x0000-0xFFFF
     */
    #[Override]
    public function getToUnicodeCMap(): ToUnicodeCMap {
        $codeSpaceRanges = [new CodeSpaceRange(0x0000, 0xFFFF)];
        $fullRange = new BFRange(0x0000, 0xFFFF, ['0000']);

        return new ToUnicodeCMap($codeSpaceRanges, 2, $fullRange);
    }
}
|
||||
10
includes/pdfparser/Document/CMap/Registry/CMapResource.php
Normal file
10
includes/pdfparser/Document/CMap/Registry/CMapResource.php
Normal file
@ -0,0 +1,10 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\Registry;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CMap\ToUnicode\ToUnicodeCMap;
|
||||
|
||||
/**
 * Contract for a registry entry that can supply a ToUnicode CMap.
 */
interface CMapResource {
    /**
     * @internal
     *
     * @return ToUnicodeCMap the character-code-to-Unicode mapping this resource provides
     */
    public function getToUnicodeCMap(): ToUnicodeCMap;
}
|
||||
@ -0,0 +1,17 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\Registry;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CMap\Registry\Adobe\Identity0;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Integer\IntegerValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\TextString\TextStringValue;
|
||||
|
||||
/**
 * Looks up a predefined CMap resource by its registry / ordering / supplement triple.
 *
 * @internal
 */
class RegistryOrchestrator {
    /**
     * @return CMapResource|null an Identity0 instance for exactly ('Adobe', 'Identity', 0),
     *                           null for every other combination
     */
    public static function getForRegistryOrderingSupplement(TextStringValue $registry, TextStringValue $ordering, IntegerValue $supplement): ?CMapResource {
        $registryName = $registry->getText();
        $orderingName = $ordering->getText();
        $supplementNumber = $supplement->value;

        // Strict comparison on all three parts, as the original array match did.
        if ($registryName === 'Adobe' && $orderingName === 'Identity' && $supplementNumber === 0) {
            return new Identity0();
        }

        return null;
    }
}
|
||||
27
includes/pdfparser/Document/CMap/ToUnicode/BFChar.php
Normal file
27
includes/pdfparser/Document/CMap/ToUnicode/BFChar.php
Normal file
@ -0,0 +1,27 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * A single `bfchar` entry: one source character code mapped to one destination hex string.
 *
 * @internal
 */
class BFChar {
    /**
     * @param int    $sourceCode        the character code this entry maps from
     * @param string $destinationString hex string handed to CodePoint::toString() on lookup
     */
    public function __construct(
        public readonly int $sourceCode,
        public readonly string $destinationString,
    ) {
    }

    /** Whether the given character code is the one this entry maps. */
    public function containsCharacterCode(int $characterCode): bool {
        return $this->sourceCode === $characterCode;
    }

    /**
     * Converts the destination hex string to text for the given character code.
     *
     * @throws ParseFailureException when $characterCode is not this entry's source code
     */
    public function toUnicode(int $characterCode): ?string {
        if ($this->sourceCode === $characterCode) {
            return CodePoint::toString($this->destinationString);
        }

        throw new ParseFailureException(sprintf('This BFChar does not contain character code %d', $characterCode));
    }
}
|
||||
35
includes/pdfparser/Document/CMap/ToUnicode/BFRange.php
Normal file
35
includes/pdfparser/Document/CMap/ToUnicode/BFRange.php
Normal file
@ -0,0 +1,35 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * A `bfrange` entry: a contiguous range of source character codes mapped either to one
 * starting destination (incremented per code) or to an explicit list of destinations.
 *
 * @internal
 */
class BFRange {
    /**
     * @param int          $sourceCodeStart       first character code covered (inclusive)
     * @param int          $sourceCodeEnd         last character code covered (inclusive)
     * @param list<string> $destinationCodePoints hex strings; a single entry is treated as the
     *                                            destination of $sourceCodeStart
     */
    public function __construct(
        public readonly int $sourceCodeStart,
        public readonly int $sourceCodeEnd,
        public readonly array $destinationCodePoints,
    ) {
    }

    /** Whether the given character code lies within [sourceCodeStart, sourceCodeEnd]. */
    public function containsCharacterCode(int $characterCode): bool {
        return $characterCode >= $this->sourceCodeStart
            && $characterCode <= $this->sourceCodeEnd;
    }

    /**
     * Resolves the destination text for a character code in this range.
     *
     * @throws ParseFailureException when the explicit destination list has no entry for the code
     */
    public function toUnicode(int $characterCode): ?string {
        if (count($this->destinationCodePoints) === 1) {
            $startDestination = $this->destinationCodePoints[0];
            $destination = dechex(((int) hexdec($startDestination)) + $characterCode - $this->sourceCodeStart);

            // dechex() drops leading zeros, but CodePoint::toString() consumes the string in
            // 4-hex-digit units from the left. Restore the original width so a destination such
            // as '00660066' is still decoded as two UTF-16 units instead of '6600' + '66'.
            return CodePoint::toString(
                str_pad($destination, strlen($startDestination), '0', STR_PAD_LEFT),
            );
        }

        return CodePoint::toString(
            $this->destinationCodePoints[$characterCode - $this->sourceCodeStart]
                ?? throw new ParseFailureException(),
        );
    }
}
|
||||
37
includes/pdfparser/Document/CMap/ToUnicode/CodePoint.php
Normal file
37
includes/pdfparser/Document/CMap/ToUnicode/CodePoint.php
Normal file
@ -0,0 +1,37 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Converts hex-encoded UTF-16 code units into a PHP string.
 */
class CodePoint {
    /**
     * Decodes $hexString four hex digits at a time; when the next eight digits form a
     * high/low surrogate pair they are combined into a single code point.
     *
     * The input is a string rather than an int because multiple concatenated code units
     * can exceed PHP_INT_MAX.
     *
     * @throws InvalidArgumentException when $hexString is not made up solely of hex digits
     * @throws ParseFailureException when mb_chr() cannot produce a character for a code point
     */
    public static function toString(string $hexString): string {
        if (!ctype_xdigit($hexString)) {
            throw new InvalidArgumentException(sprintf('Expected hex string, got "%s"', $hexString));
        }

        $decoded = '';
        $length = strlen($hexString);
        $offset = 0;
        while ($offset < $length) {
            // Look ahead 8 digits to see whether they form a surrogate pair.
            $lookAhead = (int) hexdec(substr($hexString, $offset, 8));
            $high = ($lookAhead >> 16) & 0xFFFF;
            $low = $lookAhead & 0xFFFF;
            $isSurrogatePair = $high >= 0xD800 && $high <= 0xDBFF
                && $low >= 0xDC00 && $low <= 0xDFFF;

            if ($isSurrogatePair) {
                $codePoint = (($high - 0xD800) << 10) + ($low - 0xDC00) + 0x10000;
                $offset += 8; // a surrogate pair is 4 bytes = 8 hex digits
            } else {
                $codePoint = (int) hexdec(substr($hexString, $offset, 4));
                $offset += 4; // a single code unit is 2 bytes = 4 hex digits
            }

            $character = mb_chr($codePoint);
            if ($character === false) {
                throw new ParseFailureException();
            }

            $decoded .= $character;
        }

        return $decoded;
    }
}
|
||||
@ -0,0 +1,11 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
/**
 * Immutable pair of integers marking the start and end of a code space range.
 */
class CodeSpaceRange {
    /** Start of the range, as passed to the constructor. */
    public readonly int $codeSpaceStart;

    /** End of the range, as passed to the constructor. */
    public readonly int $codeSpaceEnd;

    public function __construct(int $codeSpaceStart, int $codeSpaceEnd) {
        $this->codeSpaceStart = $codeSpaceStart;
        $this->codeSpaceEnd = $codeSpaceEnd;
    }
}
|
||||
64
includes/pdfparser/Document/CMap/ToUnicode/ToUnicodeCMap.php
Normal file
64
includes/pdfparser/Document/CMap/ToUnicode/ToUnicodeCMap.php
Normal file
@ -0,0 +1,64 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
|
||||
/**
 * Maps hex-encoded character codes to Unicode text using an ordered list of
 * BFRange / BFChar entries.
 */
class ToUnicodeCMap {
    /** @var list<BFRange|BFChar> */
    private readonly array $bfCharRangeInfo;

    /**
     * @no-named-arguments
     *
     * @param list<CodeSpaceRange> $codeSpaceRanges
     * @param int<1, max> $byteSize number of bytes per character code
     * @throws InvalidArgumentException when $byteSize is smaller than 1
     */
    public function __construct(
        public readonly array $codeSpaceRanges,
        public readonly int $byteSize,
        BFRange|BFChar ...$bfCharRangeInfo,
    ) {
        if ($this->byteSize < 1) {
            throw new InvalidArgumentException();
        }

        $this->bfCharRangeInfo = $bfCharRangeInfo;
    }

    /**
     * Splits $characterGroup into chunks of byteSize * 2 hex digits and concatenates the
     * Unicode text of each chunk; chunks without a mapping contribute an empty string.
     *
     * @throws PdfParserException
     */
    public function textToUnicode(string $characterGroup): string {
        $text = '';
        foreach (str_split($characterGroup, $this->byteSize * 2) as $hexCharacter) {
            $text .= $this->charToUnicode((int) hexdec($hexCharacter)) ?? '';
        }

        return $text;
    }

    /**
     * Returns the first mapping for $characterCode that is not the NULL character.
     *
     * @throws PdfParserException
     */
    protected function charToUnicode(int $characterCode): ?string {
        $mappedToNullCharacter = false;
        foreach ($this->bfCharRangeInfo as $mapping) {
            if ($mapping->containsCharacterCode($characterCode)) {
                $unicode = $mapping->toUnicode($characterCode);
                // Some characters map to NULL in one entry and to an actual character in
                // another, so keep looking after a NULL result.
                if ($unicode !== "\0") {
                    return $unicode;
                }

                $mappedToNullCharacter = true;
            }
        }

        if ($mappedToNullCharacter) {
            return "\0"; // only return NULL when it is the only character this is mapped to
        }

        return $characterCode === 0 ? '' : null;
    }
}
|
||||
@ -0,0 +1,12 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
/**
 * Keywords that open and close the sections of a ToUnicode CMap.
 */
enum ToUnicodeCMapOperator: string {
    // Code space range section
    case BeginCodeSpaceRange = 'begincodespacerange';
    case EndCodeSpaceRange = 'endcodespacerange';

    // Single character mappings
    case BeginBFChar = 'beginbfchar';
    case EndBFChar = 'endbfchar';

    // Range mappings
    case BeginBFRange = 'beginbfrange';
    case EndBFRange = 'endbfrange';
}
|
||||
@ -0,0 +1,91 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CMap\ToUnicode;
|
||||
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Stream\Stream;
|
||||
|
||||
/**
 * Parses the textual body of a ToUnicode CMap stream into a ToUnicodeCMap.
 *
 * @internal
 */
class ToUnicodeCMapParser {
    /**
     * Reads the codespacerange section, then every bfchar and bfrange section found between
     * $startOffset and $startOffset + $nrOfBytes, and returns them as one ToUnicodeCMap with
     * the bfchar/bfrange entries ordered by their position in the stream.
     *
     * @param Stream $stream      stream containing the CMap
     * @param int    $startOffset offset in $stream where the CMap starts
     * @param int    $nrOfBytes   number of bytes, counted from $startOffset, to search in
     *
     * @throws PdfParserException
     */
    public static function parse(Stream $stream, int $startOffset, int $nrOfBytes): ToUnicodeCMap {
        $beginCodeSpaceRangePos = $stream->firstPos(ToUnicodeCMapOperator::BeginCodeSpaceRange, $startOffset, $startOffset + $nrOfBytes)
            ?? throw new ParseFailureException(sprintf('Missing %s', ToUnicodeCMapOperator::BeginCodeSpaceRange->value));
        $beginCodeSpaceRangePos += strlen(ToUnicodeCMapOperator::BeginCodeSpaceRange->value);
        $endCodeSpaceRangePos = $stream->firstPos(ToUnicodeCMapOperator::EndCodeSpaceRange, $beginCodeSpaceRangePos, $startOffset + $nrOfBytes)
            ?? throw new ParseFailureException();
        $codeSpaceRangeSectionString = $stream->read($beginCodeSpaceRangePos, $endCodeSpaceRangePos - $beginCodeSpaceRangePos);
        $codeSpaceRanges = [];
        $byteSize = null;
        foreach (explode("\n", $codeSpaceRangeSectionString) as $codeSpaceRangeSectionStringLine) {
            if (trim($codeSpaceRangeSectionStringLine) === '') {
                continue;
            }

            if (preg_match('/^\s*<\s*(?P<start>[0-9a-fA-F]+)\s*>\s*<\s*(?P<end>[0-9a-fA-F]+)\s*>\s*$/', $codeSpaceRangeSectionStringLine, $matchesSpaceRange) !== 1) {
                throw new ParseFailureException('Unrecognized codespacerange format');
            }

            if (strlen($matchesSpaceRange['start']) !== strlen($matchesSpaceRange['end'])) {
                throw new ParseFailureException(sprintf('Start(%s) and end(%s) of codespacerange don\'t have the same number of bytes', $matchesSpaceRange['start'], $matchesSpaceRange['end']));
            }

            if (($strlen = strlen($matchesSpaceRange['start'])) % 2 !== 0 || !is_int($byteSizeRange = $strlen / 2)) {
                throw new ParseFailureException(sprintf('Codespaceranges must be an even number of hex digits, got %d', $strlen));
            }

            // All code space ranges of one CMap must use the same number of bytes.
            if ($byteSize !== null && $byteSizeRange !== $byteSize) {
                throw new ParseFailureException(sprintf('Byte size of codespaceranges is inconsistent, expected %d, got %d', $byteSize, $byteSizeRange));
            }

            $byteSize = $byteSizeRange;
            $codeSpaceRanges[] = new CodeSpaceRange((int) hexdec($matchesSpaceRange['start']), (int) hexdec($matchesSpaceRange['end']));
        }

        /** @var array<int, list<BFRange|BFChar>> $bfCharRangeInfo where the first index is used to track the position of the element in the CMap */
        $bfCharRangeInfo = [];
        $lastPos = $startOffset;
        while (($beginBFCharPos = $stream->firstPos(ToUnicodeCMapOperator::BeginBFChar, $lastPos, $startOffset + $nrOfBytes)) !== null) {
            $beginBFCharPos += strlen(ToUnicodeCMapOperator::BeginBFChar->value);
            $endBFCharPos = $stream->firstPos(ToUnicodeCMapOperator::EndBFChar, $beginBFCharPos, $startOffset + $nrOfBytes)
                ?? throw new ParseFailureException();
            if (preg_match_all('/\s*<(?P<source>[^>]+)>\s*<(?P<destination>[^>]+)>\s*/', $stream->read($beginBFCharPos, $endBFCharPos - $beginBFCharPos), $matchesBFChar, PREG_SET_ORDER) === 0) {
                throw new ParseFailureException('Unrecognized bfchar format');
            }

            foreach ($matchesBFChar as $matchBFChar) {
                $bfCharRangeInfo[$beginBFCharPos][] = new BFChar((int) hexdec(trim($matchBFChar['source'])), trim($matchBFChar['destination']));
            }
            $lastPos = $endBFCharPos;
        }

        $lastPos = $startOffset;
        while (($beginBFRangePos = $stream->firstPos(ToUnicodeCMapOperator::BeginBFRange, $lastPos, $startOffset + $nrOfBytes)) !== null) {
            $endBFRangePos = $stream->firstPos(ToUnicodeCMapOperator::EndBFRange, $beginBFRangePos, $startOffset + $nrOfBytes)
                ?? throw new ParseFailureException();
            if (preg_match_all('/\s*<(?P<start>[^>]+)>\s*<(?P<end>[^>]+)>\s*(?P<targetString>(<[^>]+>)|(\[\s*(<[^>]+>\s*)+\]))/', $stream->read($beginBFRangePos, $endBFRangePos - $beginBFRangePos), $matchesBFRange, PREG_SET_ORDER) === 0) {
                throw new ParseFailureException('Unrecognized bfrange format');
            }

            foreach ($matchesBFRange as $matchBFRange) {
                // The pattern above accepts any whitespace (\s) between array entries, so strip
                // all of it - not only literal spaces - before splitting on '><'. Otherwise
                // entries separated by a newline or tab would stay glued together.
                $targetString = preg_replace('/\s+/', '', $matchBFRange['targetString'])
                    ?? throw new ParseFailureException('An error occurred while trying to remove whitespace from a bfrange target');

                $bfCharRangeInfo[$beginBFRangePos][] = new BFRange(
                    (int) hexdec(trim($matchBFRange['start'])),
                    (int) hexdec(trim($matchBFRange['end'])),
                    array_map(
                        fn (string $value) => trim($value),
                        explode('><', rtrim(ltrim($targetString, '[<'), '>]'))
                    )
                );
            }
            $lastPos = $endBFRangePos;
        }

        ksort($bfCharRangeInfo); // Make sure that Char and Range are in order they occur in the CMap
        return new ToUnicodeCMap(
            $codeSpaceRanges,
            $byteSize !== null ? $byteSize : 2, // falls back to 2 bytes when no code space range line was present
            ...array_merge(...$bfCharRangeInfo)
        );
    }
}
|
||||
@ -0,0 +1,28 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\CompatibilityOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\InlineImageOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\MarkedContentOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\TextObjectOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ClippingPathOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ColorOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\GraphicsStateOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathConstructionOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathPaintingOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextPositioningOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextShowingOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextStateOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Type3FontOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\XObjectOperator;
|
||||
|
||||
/**
 * Immutable pairing of a content stream operator with its raw operand string.
 *
 * @internal
 */
class ContentStreamCommand {
    /**
     * @param CompatibilityOperator|InlineImageOperator|MarkedContentOperator|TextObjectOperator|ClippingPathOperator|ColorOperator|GraphicsStateOperator|PathConstructionOperator|PathPaintingOperator|TextPositioningOperator|TextShowingOperator|TextStateOperator|Type3FontOperator|XObjectOperator $operator one of the supported operator enums
     * @param string $operands the operands belonging to the operator, kept as an unparsed string
     */
    public function __construct(
        public readonly
            CompatibilityOperator
            |InlineImageOperator
            |MarkedContentOperator
            |TextObjectOperator
            |ClippingPathOperator
            |ColorOperator
            |GraphicsStateOperator
            |PathConstructionOperator
            |PathPaintingOperator
            |TextPositioningOperator
            |TextShowingOperator
            |TextStateOperator
            |Type3FontOperator
            |XObjectOperator $operator,
        public readonly string $operands
    ) {
    }
}
|
||||
@ -0,0 +1,13 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object;
|
||||
|
||||
/**
 * Operators delimiting a compatibility section in a content stream.
 *
 * @internal
 *
 * @specification Table 33 - Compatibility operators
 */
enum CompatibilityOperator: string {
    /** `BX` */
    case BeginCompatibilitySection = 'BX';

    /** `EX` */
    case EndCompatibilitySection = 'EX';
}
|
||||
@ -0,0 +1,14 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object;
|
||||
|
||||
/**
 * Operators delimiting an inline image in a content stream.
 *
 * @internal
 *
 * @specification table 90 - Inline image operators
 */
enum InlineImageOperator: string {
    /** `BI` */
    case Begin = 'BI';

    /** `ID` */
    case BeginImageData = 'ID';

    /** `EI` */
    case End = 'EI';
}
|
||||
@ -0,0 +1,16 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object;
|
||||
|
||||
/**
 * Marked-content operators of a content stream.
 *
 * @internal
 *
 * @specification Table 352 - Marked-content operators
 */
enum MarkedContentOperator: string {
    /** Marked-content point. The operator defined by the specification is `MP` (there is no `MD` operator). */
    case Tag = 'MP';

    /** Marked-content point with an associated property list. */
    case TagProperties = 'DP';

    /** Begin a marked-content sequence. */
    case BeginMarkedContent = 'BMC';

    /** Begin a marked-content sequence with an associated property list. */
    case BeginMarkedContentWithProperties = 'BDC';

    /** End a marked-content sequence. */
    case EndMarkedContent = 'EMC';
}
|
||||
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object;
|
||||
|
||||
/**
 * Operators delimiting a text object in a content stream.
 *
 * @internal
 *
 * @specification Table 105 - Text object operators
 */
enum TextObjectOperator: string {
    /** `BT` */
    case BEGIN = 'BT';

    /** `ET` */
    case END = 'ET';
}
|
||||
@ -0,0 +1,13 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * Clipping path operators of a content stream.
 *
 * @internal
 *
 * @specification Table 60 - Clipping path operators
 */
enum ClippingPathOperator: string {
    /** `W` */
    case INTERSECT = 'W';

    /** `W*` - the even-odd variant */
    case INTERSECT_EVEN_ODD = 'W*';
}
|
||||
@ -0,0 +1,23 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * Colour operators of a content stream. Operator strings are case-sensitive: in the
 * specification the upper-case forms apply to stroking and the lower-case forms to
 * non-stroking operations.
 *
 * @internal
 *
 * @specification table 73 - Colour operators
 */
enum ColorOperator: string {
    // Colour space selection
    case SetName = 'CS';
    case SetNameNonStroking = 'cs';

    // Colour in the current colour space
    case SetStrokingColor = 'SC';
    case SetStrokingParams = 'SCN';
    case SetColor = 'sc';
    case SetColorParams = 'scn';

    // Gray level operators (`G` / `g`)
    case SetStrokingColorSpace = 'G';
    case SetColorSpace = 'g';

    // RGB operators (`RG` / `rg`)
    case SetStrokingColorDeviceRGB = 'RG';
    case SetColorDeviceRGB = 'rg';

    // CMYK operators (`K` / `k`)
    case SetStrokingColorDeviceCMYK = 'K';
    case SetColorDeviceCMYK = 'k';
}
|
||||
@ -0,0 +1,55 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Graphics state operators of a content stream. Only `cm` affects the transformation matrix.
 *
 * @internal
 *
 * @specification table 56 - Graphics state operators
 */
enum GraphicsStateOperator: string implements InteractsWithTransformationMatrix {
    case SaveCurrentStateToStack = 'q';
    case RestoreMostRecentStateFromStack = 'Q';
    case ModifyCurrentTransformationMatrix = 'cm';
    case SetLineWidth = 'w';
    case SetLineCap = 'J';
    case SetLineJoin = 'j';
    case SetMiterJoin = 'M';
    case SetLineDash = 'd';
    case SetIntent = 'ri';
    case SetFlatness = 'i';
    case SetDictName = 'gs';

    /**
     * For `cm`, parses six whitespace-separated numbers from $operands and multiplies the
     * given matrix with the matrix built from them. Every other operator returns the given
     * matrix unchanged.
     *
     * @throws ParseFailureException when the operands cannot be normalised or do not hold 6 values
     */
    #[Override]
    public function applyToTransformationMatrix(string $operands, TransformationMatrix $transformationMatrix): TransformationMatrix {
        if ($this !== self::ModifyCurrentTransformationMatrix) {
            return $transformationMatrix;
        }

        $normalizedOperands = preg_replace('/\s+/', ' ', $operands);
        if ($normalizedOperands === null) {
            throw new ParseFailureException('An error occurred while trying to remove duplicate spaces from the operands');
        }

        $components = explode(' ', trim($normalizedOperands));
        $componentCount = count($components);
        if ($componentCount !== 6) {
            throw new ParseFailureException(sprintf('Expected 6 values for matrix transformation, got %d: "%s"', $componentCount, $normalizedOperands));
        }

        [$first, $second, $third, $fourth, $fifth, $sixth] = $components;
        $operandMatrix = new TransformationMatrix(
            (float) $first,
            (float) $second,
            (float) $third,
            (float) $fourth,
            (float) $fifth,
            (float) $sixth,
        );

        return $transformationMatrix->multiplyWith($operandMatrix);
    }
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TextState;
|
||||
|
||||
/**
 * Implemented by operators that can derive a text state from their operands.
 */
interface InteractsWithTextState {
    /**
     * @param string         $operands  raw operand string of the operator
     * @param TextState|null $textState the text state before the operator, if any
     *
     * @return TextState|null the text state after the operator
     */
    public function applyToTextState(string $operands, ?TextState $textState): ?TextState;
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
|
||||
/**
 * Implemented by operators that can derive a transformation matrix from their operands.
 */
interface InteractsWithTransformationMatrix {
    /**
     * @param string               $operands             raw operand string of the operator
     * @param TransformationMatrix $transformationMatrix the matrix before the operator
     *
     * @return TransformationMatrix the matrix after the operator
     */
    public function applyToTransformationMatrix(string $operands, TransformationMatrix $transformationMatrix): TransformationMatrix;
}
|
||||
@ -0,0 +1,11 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\PositionedTextElement;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TextState;
|
||||
|
||||
/**
 * Implemented by operators that turn their operands into a positioned text element.
 */
interface ProducesPositionedTextElements {
    /**
     * @param string               $operands                   raw operand string of the operator
     * @param TransformationMatrix $textMatrix                 the current text matrix
     * @param TransformationMatrix $globalTransformationMatrix the current global transformation matrix
     * @param TextState            $textState                  the current text state
     */
    public function getPositionedTextElement(string $operands, TransformationMatrix $textMatrix, TransformationMatrix $globalTransformationMatrix, TextState $textState): PositionedTextElement;
}
|
||||
@ -0,0 +1,18 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * Path construction operators of a content stream.
 *
 * @internal
 *
 * @specification Table 58 - Path construction operators
 */
enum PathConstructionOperator: string {
    // Begin a subpath / append a straight line
    case MOVE = 'm';
    case LINE = 'l';

    // Bezier curve variants
    case CURVE_BEZIER_123 = 'c';
    case CURVE_BEZIER_23 = 'v';
    case CURVE_BEZIER_13 = 'y';

    // Close the subpath / append a rectangle
    case CLOSE = 'h';
    case RECTANGLE = 're';
}
|
||||
@ -0,0 +1,22 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * Path-painting operators of a content stream.
 *
 * @internal
 *
 * @specification Table 59 - Path-painting operators
 */
enum PathPaintingOperator: string {
    case STROKE = 'S';
    case CLOSE_STROKE = 's';
    case FILL = 'f';

    /** Identical to FILL */
    case FILL_DEPRECATED = 'F';
    case FILL_EVEN_ODD = 'f*';
    case FILL_STROKE = 'B';
    case FILL_STROKE_EVEN_ODD = 'B*';

    /**
     * `b` - close, fill and stroke using the nonzero winding number rule. The specification
     * defines both `b` and `b*`; this case was missing, so `b` could not be represented.
     */
    case CLOSE_FILL_STROKE_NONZERO = 'b';

    /** `b*` - name and value kept as-is for backward compatibility. */
    case CLOSE_FILL_STROKE = 'b*';
    case END = 'n';
}
|
||||
@ -0,0 +1,76 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTextState;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TextState;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
/**
 * Text-positioning operators of a content stream.
 *
 * @internal
 */
enum TextPositioningOperator: string implements InteractsWithTransformationMatrix, InteractsWithTextState {
    case MOVE_OFFSET = 'Td';
    case MOVE_OFFSET_LEADING = 'TD';
    case SET_MATRIX = 'Tm';
    case NEXT_LINE = 'T*';

    /**
     * `Td` / `TD`: returns a copy of the given matrix with the two operands added to its offsets.
     * `Tm`: returns a new matrix built from the six operands.
     * `T*`: returns the given matrix unchanged.
     *
     * @throws ParseFailureException when the number of operands does not match the operator
     */
    #[Override]
    public function applyToTransformationMatrix(string $operands, TransformationMatrix $transformationMatrix): TransformationMatrix {
        $operands = preg_replace('/\s+/', ' ', $operands) ?? throw new RuntimeException();
        if ($this === self::MOVE_OFFSET || $this === self::MOVE_OFFSET_LEADING) {
            $offsets = explode(' ', trim($operands));
            if (count($offsets) !== 2) {
                throw new ParseFailureException();
            }

            return new TransformationMatrix(
                $transformationMatrix->scaleX,
                $transformationMatrix->shearX,
                $transformationMatrix->shearY,
                $transformationMatrix->scaleY,
                $transformationMatrix->offsetX + (float) $offsets[0],
                $transformationMatrix->offsetY + (float) $offsets[1]
            );
        }

        if ($this === self::SET_MATRIX) {
            $matrix = explode(' ', trim($operands));
            if (count($matrix) !== 6) {
                throw new ParseFailureException();
            }

            return new TransformationMatrix((float) $matrix[0], (float) $matrix[1], (float) $matrix[2], (float) $matrix[3], (float) $matrix[4], (float) $matrix[5]);
        }

        return $transformationMatrix;
    }

    /**
     * `TD`: returns a new text state whose sixth constructor argument (the slot read back as
     * `leading` elsewhere in this package) is the negated second operand; the other values are
     * copied from $textState or defaulted when it is null. Other operators return $textState as-is.
     *
     * @throws ParseFailureException when `TD` does not receive exactly two operands
     */
    #[Override]
    public function applyToTextState(string $operands, ?TextState $textState): ?TextState {
        if ($this === self::MOVE_OFFSET_LEADING) {
            // Normalise whitespace the same way applyToTransformationMatrix() does, so operands
            // separated by several spaces, tabs or newlines are not rejected here while being
            // accepted there.
            $operands = preg_replace('/\s+/', ' ', $operands)
                ?? throw new ParseFailureException('An error occurred while trying to remove duplicate spaces from the operands');

            $offsets = explode(' ', trim($operands));
            if (count($offsets) !== 2) {
                throw new ParseFailureException();
            }

            return new TextState(
                $textState->fontName ?? null,
                $textState->fontSize ?? null,
                $textState->charSpace ?? 0,
                $textState->wordSpace ?? 0,
                $textState->scale ?? 100,
                -1 * (float) $offsets[1],
                $textState->render ?? 0,
                $textState->rise ?? 0,
            );
        }

        return $textState;
    }
}
|
||||
@ -0,0 +1,53 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTextState;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\ProducesPositionedTextElements;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\PositionedTextElement;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TextState;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Operators that paint text: Tj, ', " and TJ.
 *
 * @internal
 */
enum TextShowingOperator: string implements InteractsWithTextState, ProducesPositionedTextElements {
    case SHOW = 'Tj';
    case MOVE_SHOW = '\'';
    case MOVE_SHOW_SPACING = '"';
    case SHOW_ARRAY = 'TJ';

    /**
     * Applies the text-state side effect of the operator.
     *
     * Only the " operator (MOVE_SHOW_SPACING) changes the state: its first operand becomes the
     * word spacing and its second operand the character spacing. All other cases return the
     * given state unchanged.
     *
     * @param string $operands raw operand text that preceded the operator
     * @param TextState|null $textState current text state, null when none was established yet
     * @return TextState|null the updated state for ", otherwise $textState as given
     * @throws ParseFailureException when fewer than two operands are available for "
     */
    #[Override]
    public function applyToTextState(string $operands, ?TextState $textState): ?TextState {
        if ($this !== self::MOVE_SHOW_SPACING) {
            return $textState;
        }

        // The operand list of " is "aw ac string". The string itself may contain whitespace,
        // so only the first two tokens are split off; everything after them is left alone.
        $spacing = preg_split('/\s+/', trim($operands), 3);
        if ($spacing === false || count($spacing) < 2) {
            throw new ParseFailureException(sprintf('Expected word and character spacing operands for " operator, got "%s"', substr($operands, 0, 200)));
        }

        return new TextState(
            $textState?->fontName,
            $textState?->fontSize,
            (float) $spacing[1], // character spacing (second operand)
            (float) $spacing[0], // word spacing (first operand)
            $textState?->scale ?? 100,
            $textState?->leading ?? 0,
            $textState?->render ?? 0,
            $textState?->rise ?? 0,
        );
    }

    /**
     * Wraps the raw operands in a positioned text element.
     *
     * The element's absolute matrix is the global transformation matrix multiplied with the
     * text matrix, in that order.
     * NOTE(review): ISO 32000 defines the text rendering matrix as Tm x CTM; confirm that the
     * operand order used here matches the convention the rest of the parser relies on.
     */
    #[Override]
    public function getPositionedTextElement(string $operands, TransformationMatrix $textMatrix, TransformationMatrix $globalTransformationMatrix, TextState $textState): PositionedTextElement {
        return new PositionedTextElement(
            $operands,
            $globalTransformationMatrix->multiplyWith($textMatrix),
            $textState
        );
    }
}
|
||||
@ -0,0 +1,124 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTextState;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TextState;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\ExtendedDictionaryKey;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Operators that set a single text state parameter (Tc, Tw, Tz, TL, Tf, Tr, Ts).
 *
 * @internal
 */
enum TextStateOperator: string implements InteractsWithTextState {
    case CHAR_SPACE = 'Tc';
    case WORD_SPACE = 'Tw';
    case SCALE = 'Tz';
    case LEADING = 'TL';
    case FONT_SIZE = 'Tf';
    case RENDER = 'Tr';
    case RISE = 'Ts';

    /**
     * Returns a new text state in which the parameter owned by this operator is replaced
     * by the value parsed from the operands; all other parameters are copied from the given
     * state, or set to their defaults when no state exists yet.
     *
     * @param string $operands raw operand text that preceded the operator
     * @param TextState|null $textState current text state, null when none was established yet
     * @return TextState always a fresh instance
     * @throws ParseFailureException when the Tz operand is not a number
     * @throws InvalidArgumentException when the Tf operands are not "/FontName size"
     */
    #[Override]
    public function applyToTextState(string $operands, ?TextState $textState): TextState {
        $fontName = $textState?->fontName;
        $fontSize = $textState?->fontSize;
        $charSpace = $textState?->charSpace ?? 0;
        $wordSpace = $textState?->wordSpace ?? 0;
        $scale = $textState?->scale ?? 100;
        $leading = $textState?->leading ?? 0;
        $render = $textState?->render ?? 0;
        $rise = $textState?->rise ?? 0;

        switch ($this) {
            case self::CHAR_SPACE:
                $charSpace = (float) $operands;
                break;
            case self::WORD_SPACE:
                $wordSpace = (float) $operands;
                break;
            case self::SCALE:
                // Accept every numeric spelling ("100", "100.0", "+90", ".5"), not only the
                // ones that survive a string -> number -> string round trip.
                $trimmedOperands = trim($operands);
                if (!is_numeric($trimmedOperands)) {
                    throw new ParseFailureException(sprintf('Invalid scale operand "%s" for scale operator', $operands));
                }

                $scale = (float) $trimmedOperands;
                break;
            case self::LEADING:
                $leading = (float) $operands;
                break;
            case self::FONT_SIZE:
                // The size may be written as an integer or a real, including "12." and ".5".
                if (preg_match('/^\/(?<fontReference>[A-Za-z_0-9\.\-\+]+)\s+(?<FontSize>[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+))$/', $operands, $matches) !== 1) {
                    throw new InvalidArgumentException(sprintf('Invalid font operand "%s" for Tf operator', substr($operands, 0, 200)));
                }

                $fontName = DictionaryKey::tryFrom($matches['fontReference']) ?? new ExtendedDictionaryKey($matches['fontReference']);
                $fontSize = (float) $matches['FontSize'];
                break;
            case self::RENDER:
                $render = (int) $operands;
                break;
            case self::RISE:
                $rise = (float) $operands;
                break;
        }

        return new TextState(
            $fontName,
            $fontSize,
            $charSpace,
            $wordSpace,
            $scale,
            $leading,
            $render,
            $rise,
        );
    }
}
|
||||
@ -0,0 +1,13 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * Operators that may appear in Type 3 font glyph descriptions.
 *
 * @internal
 *
 * @specification table 111 - Type 3 font operators
 */
enum Type3FontOperator: string {
    /** Operator token `d0`. */
    case SetWidth = 'd0';

    /** Operator token `d1`. */
    case SetWidthAndBoundingBox = 'd1';
}
|
||||
@ -0,0 +1,12 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State;
|
||||
|
||||
/**
 * The single operator that paints an external object.
 *
 * @internal
 *
 * @specification table 86 - XObject operator
 */
enum XObjectOperator: string {
    /** Operator token `Do`. */
    case Paint = 'Do';
}
|
||||
101
includes/pdfparser/Document/ContentStream/ContentStream.php
Normal file
101
includes/pdfparser/Document/ContentStream/ContentStream.php
Normal file
@ -0,0 +1,101 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\ContentStreamCommand;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\GraphicsStateOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTextState;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\ProducesPositionedTextElements;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Object\TextObject;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\PositionedTextElement;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
|
||||
use PrinsFrank\PdfParser\Document\Document;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Page;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
|
||||
/**
 * A parsed content stream: an ordered list of text objects and stand-alone commands
 * from which positioned text elements and plain text can be derived.
 *
 * @api
 */
class ContentStream {
    /** @var list<TextObject|ContentStreamCommand> */
    public readonly array $content;

    /** @no-named-arguments */
    public function __construct(
        TextObject|ContentStreamCommand... $content
    ) {
        $this->content = $content;
    }

    /**
     * Replays the stream and collects every text element together with its absolute matrix.
     *
     * The result is ordered by descending vertical offset first and ascending horizontal
     * offset second.
     *
     * @return list<PositionedTextElement>
     * @throws ParseFailureException when a state restore is encountered while the state stack is empty
     */
    public function getPositionedTextElements(): array {
        $positionedTextElements = $transformationStateStack = [];
        $textState = null; // See table 103, Tf operator for initial value
        $transformationMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0); // Identity matrix
        foreach ($this->content as $content) {
            if ($content instanceof ContentStreamCommand) {
                // Commands outside a text object only update the running state.
                if ($content->operator instanceof InteractsWithTextState) {
                    $textState = $content->operator->applyToTextState($content->operands, $textState);
                } elseif ($content->operator === GraphicsStateOperator::SaveCurrentStateToStack) {
                    $transformationStateStack[] = clone $transformationMatrix;
                } elseif ($content->operator === GraphicsStateOperator::RestoreMostRecentStateFromStack) {
                    $transformationMatrix = array_pop($transformationStateStack)
                        ?? throw new ParseFailureException();
                } elseif ($content->operator instanceof InteractsWithTransformationMatrix) {
                    $transformationMatrix = $content->operator->applyToTransformationMatrix($content->operands, $transformationMatrix);
                }

                continue;
            }

            $textMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0); // Identity matrix, See Table 106, Tm operator for initial value in text object
            foreach ($content->contentStreamCommands as $contentStreamCommand) {
                if ($contentStreamCommand->operator instanceof InteractsWithTextState) {
                    $textState = $contentStreamCommand->operator->applyToTextState($contentStreamCommand->operands, $textState);
                }

                if ($contentStreamCommand->operator instanceof InteractsWithTransformationMatrix) {
                    $textMatrix = $contentStreamCommand->operator->applyToTransformationMatrix($contentStreamCommand->operands, $textMatrix);
                }

                // Text shown before any text state exists is skipped.
                if ($contentStreamCommand->operator instanceof ProducesPositionedTextElements && $textState !== null) {
                    $positionedTextElements[] = $contentStreamCommand->operator->getPositionedTextElement($contentStreamCommand->operands, $textMatrix, $transformationMatrix, $textState);
                }
            }
        }

        usort(
            $positionedTextElements,
            static function (PositionedTextElement $a, PositionedTextElement $b): int {
                if (($differenceY = $b->absoluteMatrix->offsetY <=> $a->absoluteMatrix->offsetY) !== 0) {
                    return $differenceY;
                }

                return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX;
            }
        );

        return $positionedTextElements;
    }

    /**
     * Concatenates the text of all positioned elements.
     *
     * A newline is inserted whenever the vertical offset changes. On the same line a space is
     * inserted when the gap between the end of the previous element and the start of the
     * current one is at least 40% of the previous element's scaled font size.
     *
     * @throws PdfParserException
     */
    public function getText(Document $document, Page $page): string {
        $text = '';
        $previousPositionedTextElement = null;
        foreach ($this->getPositionedTextElements() as $positionedTextElement) {
            if ($previousPositionedTextElement !== null) {
                if ($previousPositionedTextElement->absoluteMatrix->offsetY !== $positionedTextElement->absoluteMatrix->offsetY) {
                    $text .= "\n";
                } else {
                    // The previous run has to be measured with its own font: its code points
                    // and text state are only meaningful for the font it was shown in.
                    $previousWidth = $previousPositionedTextElement->getFont($document, $page)->getWidthForChars(
                        $previousPositionedTextElement->getCodePoints(),
                        $previousPositionedTextElement->textState,
                        $previousPositionedTextElement->absoluteMatrix
                    );
                    $gap = $positionedTextElement->absoluteMatrix->offsetX - $previousPositionedTextElement->absoluteMatrix->offsetX - $previousWidth;
                    $spaceThreshold = ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40;
                    if ($gap >= $spaceThreshold) {
                        $text .= ' ';
                    }
                }
            }

            $text .= $positionedTextElement->getText($document, $page);
            $previousPositionedTextElement = $positionedTextElement;
        }

        return $text;
    }
}
|
||||
@ -0,0 +1,217 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\ContentStreamCommand;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\CompatibilityOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\InlineImageOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\MarkedContentOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\TextObjectOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ClippingPathOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ColorOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\GraphicsStateOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathConstructionOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathPaintingOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextPositioningOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextShowingOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextStateOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Type3FontOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\XObjectOperator;
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Object\TextObject;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\DecoratedObject;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Splits the raw bytes of one or more content streams into commands and text objects.
 *
 * @internal
 */
class ContentStreamParser {
    /**
     * Scans the given streams byte by byte and builds a ContentStream.
     *
     * Operators are only looked for outside of literal strings, hex strings, names,
     * dictionaries and arrays. Operands that start in one stream and whose operator only
     * appears in the next stream are joined.
     *
     * @param list<DecoratedObject> $contentsObjects
     * @throws ParseFailureException when ET is encountered without a preceding BT
     */
    public static function parse(array $contentsObjects): ContentStream {
        $content = [];
        $inStringLiteral = $inResourceName = $inDictionary = false;
        $inArrayLevel = $inStringLevel = 0;
        $textObject = $previousChar = $secondToLastChar = $thirdToLastChar = $previousContentStream = $startPreviousOperandIndex = null;
        foreach ($contentsObjects as $contentsObject) {
            $startCurrentOperandIndex = 0;
            $contentStream = $contentsObject->getStream();
            $contentStreamSize = $contentStream->getSizeInBytes();
            for ($index = 0; $index < $contentStreamSize; $index++) {
                $char = $contentStream->read($index, 1);
                if ($inStringLiteral === true) {
                    // NOTE(review): a ")" directly after an escaped backslash ("\\)") is treated
                    // as escaped here, and balanced nested parentheses are not tracked.
                    if ($char === ')' && $previousChar !== '\\') {
                        $inStringLiteral = false;
                    }
                } elseif ($inResourceName === true) {
                    if (in_array($char, [' ', '<', '(', '/', "\r", "\n"], true) && $previousChar !== '\\') {
                        $inResourceName = false;
                    }
                } elseif ($inDictionary === true) {
                    if ($char === '>' && $previousChar === '>' && $secondToLastChar !== '\\') {
                        $inDictionary = false;
                    }
                } elseif ($char === '[' && $previousChar !== '\\') {
                    $inArrayLevel++;
                } elseif ($char === '<' && $previousChar === '<' && $secondToLastChar !== '\\') {
                    $inDictionary = true;
                } elseif ($char === '<' && $previousChar !== '\\' && $contentStream->read($index + 1, 1) !== '<') {
                    $inStringLevel++;
                } elseif ($char === '(' && $previousChar !== '\\') {
                    $inStringLiteral = true;
                } elseif ($char === '/' && $previousChar !== '\\') {
                    $inResourceName = true;
                } elseif ($inStringLevel > 0 || $inArrayLevel > 0) {
                    if ($inStringLevel > 0 && $char === '>' && $previousChar !== '\\') {
                        $inStringLevel--;
                    } elseif ($inArrayLevel > 0 && $char === ']' && $previousChar !== '\\') {
                        $inArrayLevel--;
                    }
                } elseif ($char === 'T' && $previousChar === 'B') { // TextObjectOperator::BEGIN
                    $startCurrentOperandIndex = $index + 1;
                    $textObject = new TextObject();
                } elseif ($char === 'T' && $previousChar === 'E') { // TextObjectOperator::END
                    $startCurrentOperandIndex = $index + 1;
                    if ($textObject === null) {
                        throw new ParseFailureException('Encountered TextObjectOperator::END without preceding TextObjectOperator::BEGIN');
                    }

                    $content[] = $textObject;
                    $textObject = null;
                } elseif ($char === 'C'
                    && (($secondToLastChar === 'B' && ($previousChar === 'M' || $previousChar === 'D')) || ($secondToLastChar === 'E' && $previousChar === 'M'))) { // MarkedContentOperator::BeginMarkedContent, MarkedContentOperator::EndMarkedContent, MarkedContentOperator::BeginMarkedContentWithProperties
                    // Marked-content delimiters are dropped; only the operand start is moved past them.
                    $startCurrentOperandIndex = $index + 1;
                } elseif (($operator = self::getOperator($char, $previousChar, $secondToLastChar, $thirdToLastChar)) !== null
                    && (($nextChar = $contentStream->read($index + 1, 1)) === '' || self::getOperator($nextChar, $char, $previousChar, $secondToLastChar) === null)) { // Skip the current hit if the next iteration is also a valid operator
                    $operands = '';
                    // Prepend the unconsumed tail of the previous stream (once) so that operands
                    // split over two streams end up in the same command.
                    if ($previousContentStream !== null && $startPreviousOperandIndex !== null && $startPreviousOperandIndex < $previousContentStream->getSizeInBytes()) {
                        $operands .= $previousContentStream->read($startPreviousOperandIndex, $previousContentStream->getSizeInBytes() - $startPreviousOperandIndex);
                        $startPreviousOperandIndex = null;
                    }
                    if (($operandLength = $index + 1 - $startCurrentOperandIndex - strlen($operator->value)) > 0) {
                        $operands .= $contentStream->read($startCurrentOperandIndex, $operandLength);
                    }

                    $command = new ContentStreamCommand($operator, trim($operands));
                    if ($textObject !== null) {
                        $textObject->addContentStreamCommand($command);
                    } else {
                        $content[] = $command;
                    }

                    $startCurrentOperandIndex = $index + 1;
                }

                $thirdToLastChar = $secondToLastChar;
                $secondToLastChar = $previousChar;
                $previousChar = $char;
            }

            $previousContentStream = $contentStream;
            $startPreviousOperandIndex = $startCurrentOperandIndex;
        }

        return new ContentStream(...$content);
    }

    /**
     * Resolves the operator that ends at the current character, if any.
     *
     * Three-letter operators take precedence over two-letter ones, which take precedence over
     * one-letter ones. A match is discarded when the character directly in front of it is a
     * backslash or a slash.
     *
     * This method uses three maps instead of calling $enum::tryFrom for all possible enums
     * as operator retrieval happens possibly millions of times in a single file
     */
    public static function getOperator(string $currentChar, ?string $previousChar, ?string $secondToLastChar, ?string $thirdToLastChar): CompatibilityOperator|InlineImageOperator|MarkedContentOperator|TextObjectOperator|ClippingPathOperator|ColorOperator|GraphicsStateOperator|PathConstructionOperator|PathPaintingOperator|TextPositioningOperator|TextShowingOperator|TextStateOperator|Type3FontOperator|XObjectOperator|null {
        $threeLetterMatch = match ($secondToLastChar . $previousChar . $currentChar) {
            'BMC' => MarkedContentOperator::BeginMarkedContent,
            'BDC' => MarkedContentOperator::BeginMarkedContentWithProperties,
            'EMC' => MarkedContentOperator::EndMarkedContent,
            'SCN' => ColorOperator::SetStrokingParams,
            'scn' => ColorOperator::SetColorParams,
            default => null,
        };
        if ($threeLetterMatch !== null) {
            return in_array($thirdToLastChar, ['\\', '/'], true) ? null : $threeLetterMatch;
        }

        $twoLetterMatch = match ($previousChar . $currentChar) {
            'BX' => CompatibilityOperator::BeginCompatibilitySection,
            'EX' => CompatibilityOperator::EndCompatibilitySection,
            'BI' => InlineImageOperator::Begin,
            'ID' => InlineImageOperator::BeginImageData,
            'EI' => InlineImageOperator::End,
            'MP' => MarkedContentOperator::Tag, // marked-content point operator is "MP"
            'DP' => MarkedContentOperator::TagProperties,
            'BT' => TextObjectOperator::BEGIN,
            'ET' => TextObjectOperator::END,
            'W*' => ClippingPathOperator::INTERSECT_EVEN_ODD,
            'CS' => ColorOperator::SetName,
            'cs' => ColorOperator::SetNameNonStroking,
            'SC' => ColorOperator::SetStrokingColor,
            'sc' => ColorOperator::SetColor,
            'RG' => ColorOperator::SetStrokingColorDeviceRGB,
            'rg' => ColorOperator::SetColorDeviceRGB,
            'cm' => GraphicsStateOperator::ModifyCurrentTransformationMatrix,
            'ri' => GraphicsStateOperator::SetIntent,
            'gs' => GraphicsStateOperator::SetDictName,
            're' => PathConstructionOperator::RECTANGLE,
            'f*' => PathPaintingOperator::FILL_EVEN_ODD,
            'B*' => PathPaintingOperator::FILL_STROKE_EVEN_ODD,
            'b*' => PathPaintingOperator::CLOSE_FILL_STROKE,
            'Td' => TextPositioningOperator::MOVE_OFFSET,
            'TD' => TextPositioningOperator::MOVE_OFFSET_LEADING,
            'Tm' => TextPositioningOperator::SET_MATRIX,
            'T*' => TextPositioningOperator::NEXT_LINE,
            'Tj' => TextShowingOperator::SHOW,
            'TJ' => TextShowingOperator::SHOW_ARRAY,
            'Tc' => TextStateOperator::CHAR_SPACE,
            'Tw' => TextStateOperator::WORD_SPACE,
            'Tz' => TextStateOperator::SCALE,
            'TL' => TextStateOperator::LEADING,
            'Tf' => TextStateOperator::FONT_SIZE,
            'Tr' => TextStateOperator::RENDER,
            'Ts' => TextStateOperator::RISE,
            'd0' => Type3FontOperator::SetWidth,
            'd1' => Type3FontOperator::SetWidthAndBoundingBox,
            'Do' => XObjectOperator::Paint,
            default => null,
        };
        if ($twoLetterMatch !== null) {
            return in_array($secondToLastChar, ['\\', '/'], true) ? null : $twoLetterMatch;
        }

        $oneLetterMatch = match ($currentChar) {
            'W' => ClippingPathOperator::INTERSECT,
            'G' => ColorOperator::SetStrokingColorSpace,
            'g' => ColorOperator::SetColorSpace,
            'K' => ColorOperator::SetStrokingColorDeviceCMYK,
            'k' => ColorOperator::SetColorDeviceCMYK,
            'q' => GraphicsStateOperator::SaveCurrentStateToStack,
            'Q' => GraphicsStateOperator::RestoreMostRecentStateFromStack,
            'w' => GraphicsStateOperator::SetLineWidth,
            'J' => GraphicsStateOperator::SetLineCap,
            'j' => GraphicsStateOperator::SetLineJoin,
            'M' => GraphicsStateOperator::SetMiterJoin,
            'd' => GraphicsStateOperator::SetLineDash,
            'i' => GraphicsStateOperator::SetFlatness,
            'm' => PathConstructionOperator::MOVE,
            'l' => PathConstructionOperator::LINE,
            'c' => PathConstructionOperator::CURVE_BEZIER_123,
            'v' => PathConstructionOperator::CURVE_BEZIER_23,
            'y' => PathConstructionOperator::CURVE_BEZIER_13,
            'h' => PathConstructionOperator::CLOSE,
            'S' => PathPaintingOperator::STROKE,
            's' => PathPaintingOperator::CLOSE_STROKE,
            'f' => PathPaintingOperator::FILL,
            'F' => PathPaintingOperator::FILL_DEPRECATED,
            'B' => PathPaintingOperator::FILL_STROKE,
            'n' => PathPaintingOperator::END,
            '\'' => TextShowingOperator::MOVE_SHOW,
            '"' => TextShowingOperator::MOVE_SHOW_SPACING,
            default => null,
        };

        if ($oneLetterMatch !== null) {
            return in_array($previousChar, ['\\', '/'], true) ? null : $oneLetterMatch;
        }

        return null;
    }
}
|
||||
@ -0,0 +1,22 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\Object;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\ContentStream\Command\ContentStreamCommand;
|
||||
|
||||
/**
 * Ordered collection of the commands found inside one text object.
 *
 * @internal
 */
class TextObject {
    /** @var list<ContentStreamCommand> */
    public array $contentStreamCommands = [];

    /** Appends a command to the end of the list and returns this object for chaining. */
    public function addContentStreamCommand(ContentStreamCommand $textOperator): self {
        array_push($this->contentStreamCommands, $textOperator);

        return $this;
    }

    /** Tells whether no command has been added yet. */
    public function isEmpty(): bool {
        return count($this->contentStreamCommands) === 0;
    }
}
|
||||
@ -0,0 +1,106 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\PositionedText;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\EncodingNameValue;
|
||||
use PrinsFrank\PdfParser\Document\Document;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Character\LiteralStringEscapeCharacter;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Font;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Page;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Raw operands of a text-showing command together with the absolute matrix and text state
 * that were active when the text was shown.
 */
class PositionedTextElement {
    /**
     * Matches one string group ("<...>" hex string or "(...)" literal string) optionally
     * followed by a numeric offset.
     */
    private const TEXT_GROUP_PATTERN = '/(?<chars>(<(\\\\>|[^>])*>)|(\((\\\\\)|[^)])*\)))(?<offset>-?[0-9]+(\.[0-9]+)?)?/';

    public function __construct(
        public readonly string $rawTextContent,
        public readonly TransformationMatrix $absoluteMatrix,
        public readonly TextState $textState,
    ) {
    }

    /**
     * Looks up the font named by the text state in the page's font dictionary.
     *
     * @throws ParseFailureException when the text state has no font name or the font cannot be found
     */
    public function getFont(Document $document, Page $page): Font {
        if ($this->textState->fontName === null) {
            throw new ParseFailureException('Unable to locate font for text element');
        }

        return $page->getFontDictionary()?->getObjectForReference($document, $this->textState->fontName, Font::class)
            ?? throw new ParseFailureException(sprintf('Unable to locate font with reference "/%s"', $this->textState->fontName->value));
    }

    /**
     * Decodes the raw operands into text using the element's font.
     *
     * A space is appended after every string group that is followed by an offset below -100.
     *
     * @throws ParseFailureException
     */
    public function getText(Document $document, Page $page): string {
        if (($result = preg_match_all(self::TEXT_GROUP_PATTERN, $this->rawTextContent, $matches, PREG_SET_ORDER)) === false) {
            throw new ParseFailureException('Error with regex');
        } elseif ($result === 0) {
            throw new ParseFailureException(sprintf('Operands "%s" is not in a recognized format', $this->rawTextContent));
        }

        $string = '';
        $font = $this->getFont($document, $page);
        foreach ($matches as $match) {
            if (str_starts_with($match['chars'], '(') && str_ends_with($match['chars'], ')')) {
                $unescapedChars = LiteralStringEscapeCharacter::unescapeCharacters(substr($match['chars'], 1, -1));
                // Resolution order: single octal escape via Differences, single character via
                // Differences, MacExpert/WinAnsi encoding, ToUnicode CMap, any other encoding, raw.
                if (preg_match('/^\\\\\d{3}$/', substr($match['chars'], 1, -1)) === 1 && ($glyph = $font->getDifferences()?->getGlyph((int) octdec(substr($match['chars'], 2, -1)))) !== null) {
                    $chars = $glyph->getChar();
                } elseif (strlen($unescapedChars) === 1 && ($glyph = $font->getDifferences()?->getGlyph(ord($unescapedChars))) !== null) {
                    $chars = $glyph->getChar();
                } elseif (in_array($encoding = $font->getEncoding(), [EncodingNameValue::MacExpertEncoding, EncodingNameValue::WinAnsiEncoding], true)) {
                    $chars = $encoding->decodeString($unescapedChars);
                } elseif (($toUnicodeCMap = $font->getToUnicodeCMap() ?? $font->getToUnicodeCMapDescendantFont()) !== null) {
                    $chars = $toUnicodeCMap->textToUnicode(bin2hex($unescapedChars));
                } elseif ($encoding !== null) {
                    $chars = $encoding->decodeString($unescapedChars);
                } else {
                    $chars = $unescapedChars;
                }

                $string .= $chars;
            } elseif (str_starts_with($match['chars'], '<') && str_ends_with($match['chars'], '>')) {
                $chars = substr($match['chars'], 1, -1);
                if (($toUnicodeCMap = $font->getToUnicodeCMap() ?? $font->getToUnicodeCMapDescendantFont()) !== null) {
                    $string .= $toUnicodeCMap->textToUnicode($chars);
                } elseif (($encoding = $font->getEncoding()) !== null) {
                    $string .= $encoding->decodeString(implode('', array_map(fn (string $character) => mb_chr((int) hexdec($character)), str_split($chars, 2))));
                } else {
                    throw new ParseFailureException('Unable to use CMap or decode string to retrieve characters for text object');
                }
            } else {
                throw new ParseFailureException(sprintf('Unrecognized character group format "%s"', $match['chars']));
            }

            if (isset($match['offset']) && (float) $match['offset'] < -100) {
                $string .= ' ';
            }
        }

        return $string;
    }

    /**
     * Extracts the character codes contained in the raw operands.
     *
     * Literal strings yield one code per byte after unescaping; hex strings yield one code
     * per group of four hex digits.
     *
     * @return list<int>
     * @throws ParseFailureException
     */
    public function getCodePoints(): array {
        $codePoints = [];
        if (($result = preg_match_all(self::TEXT_GROUP_PATTERN, $this->rawTextContent, $matches, PREG_SET_ORDER)) === false) {
            throw new ParseFailureException('Error with regex');
        } elseif ($result === 0) {
            throw new ParseFailureException(sprintf('Operands "%s" is not in a recognized format', $this->rawTextContent));
        }

        foreach ($matches as $match) {
            if (str_starts_with($match['chars'], '(') && str_ends_with($match['chars'], ')')) {
                $chars = str_replace(['\(', '\)', '\n', '\r'], ['(', ')', "\n", "\r"], substr($match['chars'], 1, -1));
                // An octal escape stands for exactly one byte. chr() keeps it a single byte, so the
                // byte-wise split below yields one code per escape; a multi-byte UTF-8 character
                // would be split into several wrong codes for values of 0x80 and above.
                $chars = preg_replace_callback('/\\\\([0-7]{3})/', static fn (array $matches): string => chr(((int) octdec($matches[1])) & 0xFF), $chars)
                    ?? throw new ParseFailureException();
                foreach (str_split($chars) as $char) {
                    $codePoints[] = ord($char);
                }
            } elseif (str_starts_with($match['chars'], '<') && str_ends_with($match['chars'], '>')) {
                foreach (str_split(substr($match['chars'], 1, -1), 4) as $char) {
                    $codePoints[] = is_int($codePoint = hexdec($char)) ? $codePoint : throw new ParseFailureException();
                }
            } else {
                throw new ParseFailureException(sprintf('Unrecognized character group format "%s"', $match['chars']));
            }
        }

        return $codePoints;
    }
}
|
||||
@ -0,0 +1,20 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\PositionedText;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\ExtendedDictionaryKey;
|
||||
|
||||
/**
 * Value holder for the text state parameters that are tracked while replaying a content stream.
 */
class TextState {
    /**
     * @param DictionaryKey|ExtendedDictionaryKey|null $fontName Tf, font resource name
     * @param float|null $fontSize Tfs, font size
     * @param float $charSpace Tc, character spacing
     * @param float $wordSpace Tw, word spacing
     * @param float $scale Th, horizontal scaling
     * @param float $leading Tl, leading
     * @param int $render Tmode, rendering mode
     * @param float $rise Trise, text rise
     */
    public function __construct(
        public readonly DictionaryKey|ExtendedDictionaryKey|null $fontName,
        public readonly ?float $fontSize,
        public float $charSpace = 0,
        public float $wordSpace = 0,
        public float $scale = 100,
        public float $leading = 0,
        public int $render = 0,
        public float $rise = 0,
    ) {
    }
}
|
||||
@ -0,0 +1,27 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\ContentStream\PositionedText;
|
||||
|
||||
/**
 * Immutable six-component transformation matrix [a b c d e f].
 */
class TransformationMatrix {
    public function __construct(
        public readonly float $scaleX, // a
        public readonly float $shearX, // b
        public readonly float $shearY, // c
        public readonly float $scaleY, // d
        public readonly float $offsetX, // e
        public readonly float $offsetY, // f
    ) {
    }

    /**
     * Returns the product "this x other" as a new matrix; neither operand is modified.
     *
     * Please note that a concatenated transformation matrix of A B !== B A
     */
    public function multiplyWith(self $other): self {
        $a = $this->scaleX * $other->scaleX + $this->shearX * $other->shearY;
        $b = $this->scaleX * $other->shearX + $this->shearX * $other->scaleY;
        $c = $this->shearY * $other->scaleX + $this->scaleY * $other->shearY;
        $d = $this->shearY * $other->shearX + $this->scaleY * $other->scaleY;
        $e = $this->offsetX * $other->scaleX + $this->offsetY * $other->shearY + $other->offsetX;
        $f = $this->offsetX * $other->shearX + $this->offsetY * $other->scaleY + $other->offsetY;

        return new self($a, $b, $c, $d, $e, $f);
    }
}
|
||||
@ -0,0 +1,92 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\CrossReferenceSource;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Stream\CrossReferenceStreamParser;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Table\CrossReferenceTableParser;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Integer\IntegerValue;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Marker;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Stream\Stream;
|
||||
|
||||
/**
 * Builds a CrossReferenceSource for a stream: finds the newest cross reference
 * section through the startxref marker, parses it, and then follows the chain
 * of Prev entries to every older section.
 *
 * @internal
 */
class CrossReferenceSourceParser {
    /**
     * Parses all cross reference sections of the given stream, newest first.
     *
     * Steps:
     *  1. locate the last EOF marker and the startxref marker that precedes it;
     *  2. read the byte offset written on the line after startxref and validate it;
     *  3. determine whether that offset points at a table ("xref") or a stream ("N G obj"),
     *     falling back to the last xref/obj marker before startxref to determine the type;
     *  4. parse that section, then keep following its Prev entry until there is none (or it is 0).
     *
     * @throws PdfParserException when a marker or offset cannot be found or is invalid, or when
     *                            the Prev chain refers back to an offset that was already followed
     */
    public static function parse(Stream $stream): CrossReferenceSource {
        $eofMarkerPos = $stream->lastPos(Marker::EOF, 0)
            ?? throw new ParseFailureException(sprintf('Unable to locate marker %s', Marker::EOF->value));
        $startXrefMarkerPos = $stream->lastPos(Marker::START_XREF, $stream->getSizeInBytes() - $eofMarkerPos)
            ?? throw new ParseFailureException(sprintf('Unable to locate marker %s', Marker::START_XREF->value));
        $startByteOffset = $stream->getStartOfNextLine($startXrefMarkerPos, $stream->getSizeInBytes())
            ?? throw new ParseFailureException('Expected a carriage return or line feed after startxref marker, none found');
        $endByteOffset = $stream->getEndOfCurrentLine($startByteOffset, $stream->getSizeInBytes())
            ?? throw new ParseFailureException('Expected a carriage return or line feed after the byte offset, none found');

        // The line after startxref must be a plain decimal integer (round-trips through an int cast)
        $byteOffsetLastCrossReferenceSection = trim($stream->read($startByteOffset, $endByteOffset - $startByteOffset));
        if ($byteOffsetLastCrossReferenceSection !== (string)(int) $byteOffsetLastCrossReferenceSection) {
            throw new ParseFailureException(sprintf('Invalid byte offset last crossReference section "%s", "%s"', $byteOffsetLastCrossReferenceSection, $stream->read($startXrefMarkerPos, $stream->getSizeInBytes() - $startXrefMarkerPos)));
        }

        $byteOffsetLastCrossReferenceSection = (int) $byteOffsetLastCrossReferenceSection;
        if ($byteOffsetLastCrossReferenceSection > $stream->getSizeInBytes()) {
            throw new ParseFailureException(sprintf('Invalid byte offset: position of last crossReference section %d is greater than total size of stream %d. Should this be %d?', $byteOffsetLastCrossReferenceSection, $stream->getSizeInBytes(), $stream->lastPos(Marker::XREF, $stream->getSizeInBytes() - $startXrefMarkerPos) ?? $stream->lastPos(Marker::OBJ, $stream->getSizeInBytes() - $startXrefMarkerPos) ?? 0));
        }

        $eolPosByteOffset = $stream->getEndOfCurrentLine($byteOffsetLastCrossReferenceSection, $stream->getSizeInBytes())
            ?? throw new ParseFailureException('Expected a newline after byte offset for last cross reference stream');

        $crossReferenceType = self::getCrossReferenceType($stream, $byteOffsetLastCrossReferenceSection, $eolPosByteOffset);
        if ($crossReferenceType === null) { // Try to recover from an invalid byte offset crossReference section
            $lastPosXrefSection = $stream->lastPos(Marker::XREF, $stream->getSizeInBytes() - $startXrefMarkerPos);
            $lastPosObject = $stream->lastPos(Marker::OBJ, $stream->getSizeInBytes() - $startXrefMarkerPos);
            if ($lastPosXrefSection === null && $lastPosObject === null) {
                throw new ParseFailureException(sprintf('Unable to determine cross reference type for start line "%s" of crossReference source, and no other crossReference table or stream was found.', $stream->read($byteOffsetLastCrossReferenceSection, $eolPosByteOffset - $byteOffsetLastCrossReferenceSection)));
            }

            // Use whichever marker occurs last before startxref to decide the section type
            $lastPossibleXrefSectionPos = $lastPosObject === null ? $lastPosXrefSection : ($lastPosXrefSection === null ? $lastPosObject : max($lastPosXrefSection, $lastPosObject));
            $eolStartXrefSectionPos = $stream->getEndOfCurrentLine($lastPossibleXrefSectionPos, $stream->getSizeInBytes())
                ?? throw new ParseFailureException(sprintf('Unable to determine cross reference type for start line "%s" of crossReference source, and no other crossReference table or stream was found.', $stream->read($startByteOffset, $endByteOffset - $startByteOffset)));
            $crossReferenceType = self::getCrossReferenceType($stream, $lastPossibleXrefSectionPos, $eolStartXrefSectionPos)
                ?? throw new ParseFailureException(sprintf('Unable to determine cross reference type for start line "%s" of crossReference source, and no other crossReference table or stream was found.', $stream->read($startByteOffset, $endByteOffset - $startByteOffset)));
        }

        // A table section ends at the next startxref marker, a stream section at the next endobj marker
        $endCrossReferenceSection = $crossReferenceType === CrossReferenceType::Table
            ? ($stream->firstPos(Marker::START_XREF, $eolPosByteOffset, $stream->getSizeInBytes()) ?? throw new ParseFailureException(sprintf('Unable to locate marker %s', Marker::START_XREF->value)))
            : ($stream->firstPos(Marker::END_OBJ, $eolPosByteOffset, $stream->getSizeInBytes()) ?? throw new ParseFailureException(sprintf('Unable to locate marker %s', Marker::END_OBJ->value)));
        $currentCrossReferenceSection = $crossReferenceType === CrossReferenceType::Table
            ? CrossReferenceTableParser::parse($stream, $eolPosByteOffset, $endCrossReferenceSection - $eolPosByteOffset)
            : CrossReferenceStreamParser::parse($stream, $eolPosByteOffset, $endCrossReferenceSection - $eolPosByteOffset);
        $crossReferenceSections = [$currentCrossReferenceSection];

        // Prev offsets that were already followed. Without this guard a malformed file whose Prev
        // entries refer back to an earlier section would keep this loop running (and the list growing) forever.
        $followedPreviousOffsets = [];
        while (($previous = $currentCrossReferenceSection->dictionary->getValueForKey(DictionaryKey::PREV, IntegerValue::class)) !== null && $previous->value !== 0) {
            if (isset($followedPreviousOffsets[$previous->value])) {
                throw new ParseFailureException(sprintf('Circular chain of previous cross reference sections detected at byte offset %d', $previous->value));
            }
            $followedPreviousOffsets[$previous->value] = true;

            $eolPosByteOffset = $stream->getEndOfCurrentLine($previous->value + 1, $stream->getSizeInBytes())
                ?? throw new ParseFailureException('Expected a newline after byte offset for cross reference stream');
            $endCrossReferenceSection = $crossReferenceType === CrossReferenceType::Table
                ? ($stream->firstPos(Marker::START_XREF, $eolPosByteOffset, $stream->getSizeInBytes()) ?? throw new ParseFailureException('Unable to locate startxref'))
                : ($stream->firstPos(Marker::END_OBJ, $eolPosByteOffset, $stream->getSizeInBytes()) ?? throw new ParseFailureException('Unable to locate endobj'));

            $currentCrossReferenceSection = $crossReferenceType === CrossReferenceType::Table
                ? CrossReferenceTableParser::parse($stream, $eolPosByteOffset, $endCrossReferenceSection - $eolPosByteOffset)
                : CrossReferenceStreamParser::parse($stream, $eolPosByteOffset, $endCrossReferenceSection - $eolPosByteOffset);
            $crossReferenceSections[] = $currentCrossReferenceSection;
        }

        return new CrossReferenceSource(... $crossReferenceSections);
    }

    /**
     * Classifies the line between the two given byte offsets.
     *
     * @return CrossReferenceType|null Table when the trimmed line equals the xref marker, Stream when it
     *                                 looks like an object header ending in "obj", null otherwise
     */
    private static function getCrossReferenceType(Stream $stream, int $byteOffsetLastCrossReferenceSection, int $byteOffsetEndOfCurrentLine): ?CrossReferenceType {
        $startCrossReferenceContent = trim($stream->read($byteOffsetLastCrossReferenceSection, $byteOffsetEndOfCurrentLine - $byteOffsetLastCrossReferenceSection));
        if ($startCrossReferenceContent === Marker::XREF->value) {
            return CrossReferenceType::Table;
        }

        if (preg_match('/^[0-9]*\s*[0-9]*\s*obj$/', $startCrossReferenceContent) === 1) {
            return CrossReferenceType::Stream;
        }

        return null;
    }
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference;
|
||||
|
||||
/**
 * The two forms in which cross reference information is recognised:
 * a cross reference table or a cross reference stream.
 */
enum CrossReferenceType {
    /** Section introduced by the xref marker */
    case Table;

    /** Section stored inside an object ("N G obj") */
    case Stream;
}
|
||||
@ -0,0 +1,78 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\CrossReferenceSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryCompressed;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\ArrayValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * Ordered collection of cross reference sections, searched newest first.
 *
 * Can be both from a crossReferenceTable or a crossReferenceStream
 */
class CrossReferenceSource {
    /** @var list<CrossReferenceSection> Where the first is the newest incremental update and the last one is the oldest */
    private readonly array $crossReferenceSections;

    /** @no-named-arguments */
    public function __construct(
        CrossReferenceSection... $crossReferenceSections,
    ) {
        $this->crossReferenceSections = $crossReferenceSections;
    }

    /**
     * Returns the entry for the given object number from the first section that has one,
     * or null when no section knows the object.
     */
    public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUseObject|CrossReferenceEntryCompressed|null {
        foreach ($this->crossReferenceSections as $section) {
            $entry = $section->getCrossReferenceEntry($objNumber);
            if ($entry === null) {
                continue;
            }

            return $entry;
        }

        return null;
    }

    /** Shorthand for getValueForKey() with ReferenceValue as the expected value type. */
    public function getReferenceForKey(DictionaryKey $dictionaryKey): ?ReferenceValue {
        return $this->getValueForKey($dictionaryKey, ReferenceValue::class);
    }

    /**
     * Returns the value stored under the given key in the dictionary of the first
     * section that has a value for it, or null when none of the sections do.
     *
     * @template T of DictionaryValue|NameValue|Dictionary
     * @param class-string<T> $valueType
     * @return T
     */
    public function getValueForKey(DictionaryKey $dictionaryKey, string $valueType): DictionaryValue|Dictionary|NameValue|null {
        foreach ($this->crossReferenceSections as $section) {
            $value = $section->dictionary->getValueForKey($dictionaryKey, $valueType);
            if ($value === null) {
                continue;
            }

            return $value;
        }

        return null;
    }

    /**
     * Returns the first element of the ID array as raw bytes.
     *
     * The element has to be a string of the form "<hex>"; the hex digits between
     * the angle brackets are decoded to binary.
     *
     * @throws ParseFailureException when the id is missing, is not a string, is not wrapped in
     *                               angle brackets, or cannot be decoded
     */
    public function getFirstId(): string {
        $idArray = $this->getValueForKey(DictionaryKey::ID, ArrayValue::class);
        $rawFirstId = $idArray->value[0]
            ?? throw new ParseFailureException('Unable to retrieve first id from cross reference source');
        if (is_string($rawFirstId) === false) {
            throw new ParseFailureException('First id is not a string');
        }

        $hasHexDelimiters = str_starts_with($rawFirstId, '<') && str_ends_with($rawFirstId, '>');
        if ($hasHexDelimiters === false) {
            throw new ParseFailureException('Unsupported first id format, expected "<hex>"');
        }

        // Strip the surrounding "<" and ">" before decoding
        $decoded = hex2bin(substr($rawFirstId, 1, -1));
        if ($decoded === false) {
            throw new ParseFailureException('Unable to retrieve binary value from first id');
        }

        return $decoded;
    }
}
|
||||
@ -0,0 +1,32 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source\Section;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\CrossReferenceSubSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryCompressed;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
|
||||
/**
 * One cross reference section: a dictionary plus the subsections that list the entries.
 *
 * There are multiple crossReference sections if there are incremental updates. See 7.5.6
 */
class CrossReferenceSection {
    /** @var list<CrossReferenceSubSection> */
    public readonly array $crossReferenceSubSections;

    /** @no-named-arguments */
    public function __construct(
        public readonly Dictionary $dictionary,
        CrossReferenceSubSection... $crossReferenceSubSections,
    ) {
        $this->crossReferenceSubSections = $crossReferenceSubSections;
    }

    /**
     * Looks the object number up in the first subsection whose range contains it.
     * Returns null when no subsection covers the object number.
     */
    public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUseObject|CrossReferenceEntryCompressed|null {
        foreach ($this->crossReferenceSubSections as $subSection) {
            if (!$subSection->containsObject($objNumber)) {
                continue;
            }

            return $subSection->getCrossReferenceEntry($objNumber);
        }

        return null;
    }
}
|
||||
@ -0,0 +1,54 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryCompressed;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryFreeObject;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
/**
 * A contiguous range of object numbers, starting at $firstObjectNumber and
 * spanning $nrOfEntries numbers, together with the entries supplied for them.
 */
class CrossReferenceSubSection {
    /** @var array<CrossReferenceEntryInUseObject|CrossReferenceEntryFreeObject|CrossReferenceEntryCompressed> */
    public array $crossReferenceEntries = [];

    /**
     * @phpstan-assert int<0, max> $nrOfEntries
     *
     * @throws InvalidArgumentException when $nrOfEntries is negative
     *
     * @no-named-arguments
     */
    public function __construct(
        public readonly int $firstObjectNumber,
        public readonly int $nrOfEntries,
        CrossReferenceEntryInUseObject|CrossReferenceEntryFreeObject|CrossReferenceEntryCompressed... $crossReferenceEntries
    ) {
        if ($this->nrOfEntries < 0) {
            throw new InvalidArgumentException('$nrOfEntries should be a positive number');
        }

        $this->crossReferenceEntries = $crossReferenceEntries;
    }

    /** Whether the object number lies in [firstObjectNumber, firstObjectNumber + nrOfEntries). */
    public function containsObject(int $objNumber): bool {
        if ($objNumber < $this->firstObjectNumber) {
            return false;
        }

        return $objNumber < $this->firstObjectNumber + $this->nrOfEntries;
    }

    /**
     * Returns the entry for the object number, or null when the number is outside this subsection.
     *
     * @throws RuntimeException when the number is in range but no entry is stored for it,
     *                          or when the stored entry is a free object
     */
    public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUseObject|CrossReferenceEntryCompressed|null {
        if (!self::containsObject($objNumber)) {
            return null;
        }

        // Entries are stored zero-based, relative to the first object number of the subsection
        $entryIndex = $objNumber - $this->firstObjectNumber;
        $entry = $this->crossReferenceEntries[$entryIndex]
            ?? throw new RuntimeException(sprintf('Object with key %d not found', $objNumber - $this->firstObjectNumber));
        if ($entry instanceof CrossReferenceEntryFreeObject) {
            throw new RuntimeException('Cross reference entry for object should point to either a compressed or uncompressed entry, not a free object nr');
        }

        return $entry;
    }
}
|
||||
@ -0,0 +1,21 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry;
|
||||
|
||||
/**
 * Cross reference entry for an object that is stored inside an object stream.
 *
 * 7.5.8, Table 18, only present in crossReferenceStreams
 */
class CrossReferenceEntryCompressed {
    /**
     * @see Table 18
     *
     * The object number of the object stream in which this object is
     * stored. (The generation number of the object stream shall be
     * implicitly 0.)
     */
    final public const GENERATION_NUMBER = 0;

    /**
     * @param int $storedInStreamWithObjectNumber      object number of the object stream that holds this object
     * @param int $indexOfThisObjectWithinObjectStream index of this object within that object stream
     */
    public function __construct(
        public readonly int $storedInStreamWithObjectNumber,
        public readonly int $indexOfThisObjectWithinObjectStream,
    ) {
    }
}
|
||||
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry;
|
||||
|
||||
/**
 * Cross reference entry describing a free object.
 *
 * Present in both crossReferenceTable and crossReferenceStream
 */
class CrossReferenceEntryFreeObject {
    /**
     * @param int $objectNumberNextFreeObject object number of the next free object
     * @param int $generationNumber           generation number recorded for this entry
     */
    public function __construct(
        public readonly int $objectNumberNextFreeObject,
        public readonly int $generationNumber,
    ) {
    }
}
|
||||
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry;
|
||||
|
||||
/**
 * Cross reference entry describing an object that is in use.
 *
 * Present in both crossReferenceTable and crossReferenceStream
 */
class CrossReferenceEntryInUseObject {
    /**
     * @param int $byteOffsetInDecodedStream byte offset at which the object can be found
     * @param int $generationNumber          generation number recorded for this entry
     */
    public function __construct(
        public readonly int $byteOffsetInDecodedStream,
        public readonly int $generationNumber,
    ) {
    }
}
|
||||
@ -0,0 +1,83 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Stream;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\CrossReferenceSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\CrossReferenceSubSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryCompressed;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryFreeObject;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryParser;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\ArrayValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\CrossReferenceStreamByteSizes;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Integer\IntegerValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\TypeNameValue;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Marker;
|
||||
use PrinsFrank\PdfParser\Document\Object\Item\CompressedObject\CompressedObjectContent\CompressedObjectContentParser;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Stream\Stream;
|
||||
|
||||
/**
 * Parses a cross reference stream (an object whose dictionary has type XRef)
 * into a CrossReferenceSection.
 *
 * @internal
 */
class CrossReferenceStreamParser {
    private const HEX_CHARS_IN_BYTE = 2;

    /**
     * Parses the cross reference stream found in the $nrOfBytes bytes starting at $startPos.
     *
     * The dictionary is parsed first; its W entry gives the byte width of the three fields of
     * every row, and Index (or, when absent, [0, Size]) gives the object-number ranges the rows
     * belong to. The decoded stream content is then cut into rows and each row becomes one entry.
     *
     * @phpstan-assert int<0, max> $startPos
     * @phpstan-assert int<1, max> $nrOfBytes
     *
     * @throws PdfParserException when the dictionary is not of type xref, required entries or
     *                            markers are missing, or a row cannot be decoded
     */
    public static function parse(Stream $stream, int $startPos, int $nrOfBytes): CrossReferenceSection {
        $dictionary = DictionaryParser::parse($stream, $startPos, $nrOfBytes);
        if ($dictionary->getType() !== TypeNameValue::X_REF) {
            throw new ParseFailureException('Expected stream of type xref');
        }

        $wValue = $dictionary->getValueForKey(DictionaryKey::W, CrossReferenceStreamByteSizes::class)
            ?? throw new ParseFailureException('Cross reference streams should have a dictionary entry for "W"');
        $startStream = $stream->getStartNextLineAfter(Marker::STREAM, $startPos, $startPos + $nrOfBytes)
            ?? throw new ParseFailureException(sprintf('Unable to locate marker %s', Marker::STREAM->value));

        if (($length = $dictionary->getValueForKey(DictionaryKey::LENGTH, IntegerValue::class)?->value) === null) {
            // No Length entry: derive it from the last endstream marker before the end of this section.
            // lastPos() takes its bound as an offset from the end of the stream, so the end position of the
            // section has to be subtracted as a whole (the sum must be parenthesised).
            $endOfSection = $startPos + $nrOfBytes;
            $endStream = $stream->lastPos(Marker::END_STREAM, $stream->getSizeInBytes() - $endOfSection);
            if ($endStream === null || $endStream > $endOfSection) {
                throw new ParseFailureException(sprintf('Expected end of stream content marked by %s, none found', Marker::END_STREAM->value));
            }

            $length = $endStream - $startStream - 1;
        }

        $rowLengthInHexChars = $wValue->getTotalLengthInBytes() * self::HEX_CHARS_IN_BYTE;
        if ($rowLengthInHexChars < 1) {
            // str_split() would raise a ValueError for a chunk length below 1
            throw new ParseFailureException('Cross reference stream entries should have a total length of at least one byte');
        }

        $typeFieldLengthInHexChars = $wValue->lengthRecord1InBytes * self::HEX_CHARS_IN_BYTE;

        $entries = [];
        $hexContent = bin2hex(CompressedObjectContentParser::parseBinary($stream, $startStream, $length, $dictionary)->toString());
        foreach (str_split($hexContent, $rowLengthInHexChars) as $referenceRow) {
            // When the first W element is 0 the type field is absent from the row and defaults to
            // type 1 (uncompressed object) instead of being read as 0 (free object).
            $field1 = $typeFieldLengthInHexChars === 0
                ? CrossReferenceStreamType::UNCOMPRESSED_OBJECT->value
                : hexdec(substr($referenceRow, 0, $typeFieldLengthInHexChars));
            $field2 = hexdec(substr($referenceRow, $wValue->lengthRecord1InBytes * self::HEX_CHARS_IN_BYTE, $wValue->lengthRecord2InBytes * self::HEX_CHARS_IN_BYTE));
            $field3 = hexdec(substr($referenceRow, ($wValue->lengthRecord1InBytes + $wValue->lengthRecord2InBytes) * self::HEX_CHARS_IN_BYTE, $wValue->lengthRecord3InBytes * self::HEX_CHARS_IN_BYTE));
            // hexdec() returns a float when the value does not fit in an int
            if (!is_int($field1) || !is_int($field2) || !is_int($field3)) {
                throw new ParseFailureException(sprintf('Field 1, 2 and 3 in cross reference entries should be int, got %s, %s and %s', gettype($field1), gettype($field2), gettype($field3)));
            }

            $entries[] = match (CrossReferenceStreamType::tryFrom($field1)) {
                CrossReferenceStreamType::LINKED_LIST_FREE_OBJECT => new CrossReferenceEntryFreeObject($field2, $field3),
                CrossReferenceStreamType::UNCOMPRESSED_OBJECT => new CrossReferenceEntryInUseObject($field2, $field3),
                CrossReferenceStreamType::COMPRESSED_OBJECT => new CrossReferenceEntryCompressed($field2, $field3),
                null => throw new ParseFailureException(sprintf('Unrecognized CrossReferenceStream type "%s"', $field1)),
            };
        }

        /** @var list<int> $startObjNrOfItemsArray where all even items are the start object number and all odd items are the number of objects */
        $startObjNrOfItemsArray = $dictionary->getValueForKey(DictionaryKey::INDEX, ArrayValue::class)->value
            ?? [0, $dictionary->getValueForKey(DictionaryKey::SIZE, IntegerValue::class)->value ?? throw new ParseFailureException('Cross reference streams should have either an index or a size, neither was found')];

        // Hand the entries out to the subsections in order: each (start, count) pair takes the next "count" entries
        $crossReferenceSubSections = [];
        foreach (array_chunk($startObjNrOfItemsArray, 2) as $startNrNrOfObjects) {
            /** @phpstan-ignore offsetAccess.notFound, offsetAccess.notFound */
            $crossReferenceSubSections[] = new CrossReferenceSubSection($startNrNrOfObjects[0], $startNrNrOfObjects[1], ... array_slice($entries, 0, $startNrNrOfObjects[1]));
            $entries = array_slice($entries, $startNrNrOfObjects[1]);
        }

        return new CrossReferenceSection($dictionary, ... $crossReferenceSubSections);
    }
}
|
||||
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Stream;
|
||||
|
||||
/**
 * Value of the type field of a cross reference stream row; each case selects
 * the kind of entry the row is turned into.
 *
 * @internal
 */
enum CrossReferenceStreamType: int {
    /** Row describes a free object */
    case LINKED_LIST_FREE_OBJECT = 0;

    /** Row describes an object that is in use and not compressed */
    case UNCOMPRESSED_OBJECT = 1;

    /** Row describes an object that is stored in an object stream */
    case COMPRESSED_OBJECT = 2;
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Table;
|
||||
|
||||
/**
 * Keyword that ends a cross reference table entry line and tells whether
 * the entry is in use or free.
 *
 * @internal
 */
enum CrossReferenceTableInUseOrFree: string {
    /** Entry line ends with "n" */
    case IN_USE = 'n';

    /** Entry line ends with "f" */
    case FREE = 'f';
}
|
||||
@ -0,0 +1,57 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\CrossReference\Table;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\CrossReferenceSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\CrossReferenceSubSection;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryFreeObject;
|
||||
use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryParser;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Character\WhitespaceCharacter;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Marker;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Stream\Stream;
|
||||
|
||||
/**
 * Parses a classic cross reference table and its trailer dictionary into a CrossReferenceSection.
 *
 * @internal
 */
class CrossReferenceTableParser {
    /**
     * Parses the table lines found between $startPos and the trailer marker, and the
     * dictionary that follows the trailer marker.
     *
     * Every line is split on single spaces: a line with two parts starts a new subsection
     * (first object number, number of entries), a line with three parts is an entry
     * (two numbers followed by "n" or "f"). Any other number of parts is rejected.
     *
     * @throws PdfParserException when the trailer is missing or a line cannot be interpreted
     */
    public static function parse(Stream $stream, int $startPos, int $nrOfBytes): CrossReferenceSection {
        $startTrailerPos = $stream->firstPos(Marker::TRAILER, $startPos, $startPos + $nrOfBytes)
            ?? throw new ParseFailureException('Unable to locate trailer for crossReferenceTable');
        $dictionaryStartPos = $startTrailerPos + Marker::TRAILER->length();
        $dictionary = DictionaryParser::parse($stream, $dictionaryStartPos, $nrOfBytes - ($dictionaryStartPos - $startPos));

        $lineFeed = WhitespaceCharacter::LINE_FEED->value;

        // Normalise line endings: carriage returns become line feeds, doubled line feeds collapse into one
        $content = trim($stream->read($startPos, $startTrailerPos - $startPos));
        $content = str_replace([WhitespaceCharacter::CARRIAGE_RETURN->value, $lineFeed . $lineFeed], $lineFeed, $content);

        $firstObjectNumber = null;
        $nrOfEntries = null;
        $crossReferenceEntries = [];
        $crossReferenceSubSections = [];
        foreach (explode($lineFeed, $content) as $line) {
            $sections = explode(WhitespaceCharacter::SPACE->value, trim($line));
            $sectionCount = count($sections);

            if ($sectionCount === 2) {
                // Subsection header: flush the subsection collected so far before starting the next one
                if ($firstObjectNumber !== null && $nrOfEntries !== null) {
                    $crossReferenceSubSections[] = new CrossReferenceSubSection($firstObjectNumber, $nrOfEntries, ... $crossReferenceEntries); // Use previous objectNr and nrOfEntries
                }

                $crossReferenceEntries = [];
                $firstObjectNumber = (int) $sections[0];
                $nrOfEntries = (int) $sections[1];

                continue;
            }

            if ($sectionCount === 3) {
                $crossReferenceEntries[] = match (CrossReferenceTableInUseOrFree::tryFrom(trim($sections[2]))) {
                    CrossReferenceTableInUseOrFree::IN_USE => new CrossReferenceEntryInUseObject((int) $sections[0], (int) $sections[1]),
                    CrossReferenceTableInUseOrFree::FREE => new CrossReferenceEntryFreeObject((int) $sections[0], (int) $sections[1]),
                    null => throw new ParseFailureException(sprintf('Unrecognized crossReference table record type %s', trim($sections[2])))
                };

                continue;
            }

            throw new ParseFailureException(sprintf('Invalid line "%s", 2 or 3 sections expected, %d found', substr(trim($line), 0, 30), count($sections)));
        }

        // Flush the last subsection, which has no following header line to trigger it
        if ($firstObjectNumber !== null && $nrOfEntries !== null) {
            $crossReferenceSubSections[] = new CrossReferenceSubSection($firstObjectNumber, $nrOfEntries, ... $crossReferenceEntries);
        }

        return new CrossReferenceSection($dictionary, ... $crossReferenceSubSections);
    }
}
|
||||
143
includes/pdfparser/Document/Dictionary/Dictionary.php
Normal file
143
includes/pdfparser/Document/Dictionary/Dictionary.php
Normal file
@ -0,0 +1,143 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryEntry\DictionaryEntry;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\ExtendedDictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\DictionaryArrayValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\SubtypeNameValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\TypeNameValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValueArray;
|
||||
use PrinsFrank\PdfParser\Document\Document;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\DecoratedObject;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
class Dictionary {
|
||||
/** @var array<DictionaryEntry> */
|
||||
public readonly array $dictionaryEntries;
|
||||
|
||||
/** @no-named-arguments */
|
||||
public function __construct(
|
||||
DictionaryEntry... $dictionaryEntries
|
||||
) {
|
||||
$this->dictionaryEntries = $dictionaryEntries;
|
||||
}
|
||||
|
||||
/**
|
||||
* @template T of DictionaryValue|NameValue|Dictionary
|
||||
* @param class-string<T> $valueType
|
||||
* @return T
|
||||
*/
|
||||
public function getValueForKey(DictionaryKey|ExtendedDictionaryKey $dictionaryKey, string $valueType): DictionaryValue|Dictionary|NameValue|null {
|
||||
foreach ($this->dictionaryEntries as $dictionaryEntry) {
|
||||
if (($dictionaryKey instanceof DictionaryKey && $dictionaryEntry->key === $dictionaryKey)
|
||||
|| ($dictionaryKey instanceof ExtendedDictionaryKey && $dictionaryEntry->key instanceof ExtendedDictionaryKey && $dictionaryEntry->key->value === $dictionaryKey->value)) {
|
||||
$value = $dictionaryEntry->value;
|
||||
if (is_a($value, $valueType) === false) {
|
||||
throw new InvalidArgumentException(sprintf('Expected value with value %s to be of type %s, got %s', $dictionaryKey->value, $valueType, get_class($value)));
|
||||
}
|
||||
|
||||
return $value;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/** @return class-string<DictionaryValue|NameValue|Dictionary> */
|
||||
public function getTypeForKey(DictionaryKey $dictionaryKey): ?string {
|
||||
foreach ($this->dictionaryEntries as $dictionaryEntry) {
|
||||
if ($dictionaryEntry->key === $dictionaryKey) {
|
||||
return $dictionaryEntry->value::class;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public function getSubDictionary(?Document $document, DictionaryKey $dictionaryKey): ?Dictionary {
|
||||
$subDictionaryType = $this->getTypeForKey($dictionaryKey);
|
||||
if ($subDictionaryType === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if ($subDictionaryType === Dictionary::class) {
|
||||
return $this->getValueForKey($dictionaryKey, Dictionary::class) ?? throw new RuntimeException();
|
||||
}
|
||||
|
||||
if ($subDictionaryType === DictionaryArrayValue::class) {
|
||||
return ($this->getValueForKey($dictionaryKey, DictionaryArrayValue::class) ?? throw new RuntimeException())->toSingleDictionary();
|
||||
}
|
||||
|
||||
if ($subDictionaryType === ReferenceValue::class) {
|
||||
if ($document === null) {
|
||||
throw new ParseFailureException('Document is required to get subDictionary for reference');
|
||||
}
|
||||
|
||||
return ($this->getObjectForReference($document, $dictionaryKey) ?? throw new ParseFailureException())
|
||||
->getDictionary();
|
||||
}
|
||||
|
||||
throw new ParseFailureException(sprintf('Invalid type "%s" for subDictionary with key %s', $subDictionaryType, $dictionaryKey->name));
|
||||
}
|
||||
|
||||
/**
|
||||
* @template T of DecoratedObject
|
||||
* @param class-string<T>|null $expectedDecoratorFQN
|
||||
* @return ($expectedDecoratorFQN is null ? DecoratedObject : T)
|
||||
*/
|
||||
public function getObjectForReference(Document $document, DictionaryKey|ExtendedDictionaryKey $dictionaryKey, ?string $expectedDecoratorFQN = null): ?DecoratedObject {
|
||||
$reference = $this->getValueForKey($dictionaryKey, ReferenceValue::class);
|
||||
if ($reference === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return $document->getObject($reference->objectNumber, $expectedDecoratorFQN)
|
||||
?? throw new ParseFailureException();
|
||||
}
|
||||
|
||||
/**
 * Follows every reference in the reference array stored under the given key.
 *
 * Returns an empty list when the key does not hold a reference array; throws as soon as
 * one of the referenced objects cannot be supplied by the document.
 *
 * @template T of DecoratedObject
 * @param class-string<T>|null $expectedDecoratorFQN
 * @return ($expectedDecoratorFQN is null ? list<DecoratedObject> : list<T>)
 */
public function getObjectsForReference(Document $document, DictionaryKey|ExtendedDictionaryKey $dictionaryKey, ?string $expectedDecoratorFQN = null): array {
    $referenceArray = $this->getValueForKey($dictionaryKey, ReferenceValueArray::class);
    if ($referenceArray === null) {
        return [];
    }

    // array_values() keeps the result a plain list regardless of the keys of the input array.
    return array_values(array_map(
        fn ($referenceValue) => $document->getObject($referenceValue->objectNumber, $expectedDecoratorFQN)
            ?? throw new ParseFailureException(),
        $referenceArray->referenceValues,
    ));
}
|
||||
|
||||
/**
 * Returns the type name of this dictionary, or null when none is present.
 *
 * When the TYPE key itself holds a dictionary, the type name is read from
 * the TYPE key of that nested dictionary instead.
 */
public function getType(): ?TypeNameValue {
    $typeIsNested = $this->getTypeForKey(DictionaryKey::TYPE) === Dictionary::class;
    $holder = $typeIsNested
        ? $this->getValueForKey(DictionaryKey::TYPE, Dictionary::class)
        : $this;

    return $holder?->getValueForKey(DictionaryKey::TYPE, TypeNameValue::class);
}
|
||||
|
||||
/**
 * Returns the subtype name of this dictionary, or null when none is present.
 *
 * When the SUBTYPE key itself holds a dictionary, the subtype name is read from
 * the SUBTYPE key of that nested dictionary instead.
 */
public function getSubType(): ?SubtypeNameValue {
    $subTypeIsNested = $this->getTypeForKey(DictionaryKey::SUBTYPE) === Dictionary::class;
    $holder = $subTypeIsNested
        ? $this->getValueForKey(DictionaryKey::SUBTYPE, Dictionary::class)
        : $this;

    return $holder?->getValueForKey(DictionaryKey::SUBTYPE, SubtypeNameValue::class);
}
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryEntry;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\ExtendedDictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue;
|
||||
|
||||
/**
 * Read-only pairing of a dictionary key with the value stored under it.
 *
 * @api
 */
class DictionaryEntry {
    public function __construct(
        public readonly DictionaryKey|ExtendedDictionaryKey $key,
        public readonly DictionaryValue|Dictionary|NameValue $value,
    ) {}
}
|
||||
@ -0,0 +1,78 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryEntry;
|
||||
|
||||
use BackedEnum;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryFactory;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\ExtendedDictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\ArrayValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\TextString\TextStringValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Normalization\NameValueNormalizer;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
|
||||
/**
 * Builds DictionaryEntry objects from raw key/value pairs.
 *
 * @internal
 */
class DictionaryEntryFactory {
    /**
     * Resolves the key string to a known DictionaryKey (falling back to an
     * ExtendedDictionaryKey) and converts the raw value to a typed value object.
     *
     * @param string|array<string, mixed> $dictionaryValue
     * @throws PdfParserException
     */
    public static function fromKeyValuePair(string $keyString, string|array $dictionaryValue): ?DictionaryEntry {
        $key = DictionaryKey::tryFromKeyString($keyString) ?? ExtendedDictionaryKey::fromKeyString($keyString);
        $value = self::getValue($key, $dictionaryValue);

        return new DictionaryEntry($key, $value);
    }

    /**
     * Converts a raw value into the first value type the key allows that accepts it.
     *
     * Resolution order: nested dictionary (array input), indirect reference,
     * backed-enum name values, every other DictionaryValue type, and finally
     * TextStringValue as the catch-all.
     *
     * @param string|array<string, mixed> $value
     * @throws PdfParserException
     */
    protected static function getValue(DictionaryKey|ExtendedDictionaryKey $dictionaryKey, string|array $value): Dictionary|DictionaryValue|NameValue {
        $valueTypes = $dictionaryKey->getValueTypes();
        $acceptsContainer = in_array(Dictionary::class, $valueTypes, true)
            || in_array(ArrayValue::class, $valueTypes, true);

        if ($acceptsContainer && is_array($value)) {
            return DictionaryFactory::fromArray($value);
        }

        if (is_string($value)) {
            // A container-typed key may hold an indirect reference ("<obj> <gen> R") instead of the container.
            if ($acceptsContainer && preg_match('/^[0-9]+ [0-9]+ R$/', $value) === 1) {
                $reference = ReferenceValue::fromValue($value);
                if ($reference !== null) {
                    return $reference;
                }
            }

            // Enum-backed name values are tried before any other value type.
            foreach ($valueTypes as $valueType) {
                if (!is_a($valueType, BackedEnum::class, true)) {
                    continue;
                }

                $enumCase = $valueType::tryFrom(NameValueNormalizer::normalize($value));
                if ($enumCase !== null) {
                    return $enumCase;
                }
            }

            foreach ($valueTypes as $valueType) {
                if (!is_a($valueType, DictionaryValue::class, true)
                    || $valueType === TextStringValue::class) { // TextStrings accept everything, so we check that last
                    continue;
                }

                $parsed = $valueType::fromValue($value);
                if ($parsed !== null) {
                    return $parsed;
                }
            }

            if (in_array(TextStringValue::class, $valueTypes, true)) {
                return TextStringValue::fromValue($value);
            }
        }

        throw new ParseFailureException(sprintf('Value "%s" for dictionary key %s could not be parsed to a valid value type', is_array($value) ? 'array()' : $value, $dictionaryKey->value));
    }
}
|
||||
34
includes/pdfparser/Document/Dictionary/DictionaryFactory.php
Normal file
34
includes/pdfparser/Document/Dictionary/DictionaryFactory.php
Normal file
@ -0,0 +1,34 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryEntry\DictionaryEntryFactory;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
|
||||
/**
 * Turns the nested array produced while parsing into a Dictionary object.
 *
 * @internal
 */
class DictionaryFactory {
    /**
     * Creates one DictionaryEntry per array element and wraps them in a Dictionary.
     *
     * Every value must be a string or a non-list (associative) array; anything else,
     * including an empty array, is rejected with an InvalidArgumentException.
     *
     * @param array<string, mixed> $dictionaryArray
     * @throws PdfParserException
     */
    public static function fromArray(array $dictionaryArray): Dictionary {
        $entries = [];
        foreach ($dictionaryArray as $keyString => $value) {
            $isNonListArray = is_array($value) && !array_is_list($value);
            if (!is_string($value) && !$isNonListArray) {
                throw new InvalidArgumentException(sprintf('values should be either strings or non-list array, %s given', gettype($value)));
            }

            /** @var non-empty-array<string, mixed>|string $value */
            $entry = DictionaryEntryFactory::fromKeyValuePair($keyString, $value);
            if ($entry !== null) {
                $entries[] = $entry;
            }
        }

        return new Dictionary(...$entries);
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,12 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue;
|
||||
|
||||
/**
 * Contract for dictionary keys: each key declares which value types may be stored under it.
 */
interface DictionaryKeyInterface {
    /**
     * Lists the value classes that are acceptable for this key.
     *
     * @return list<class-string<DictionaryValue|Dictionary|NameValue>>
     */
    public function getValueTypes(): array;
}
|
||||
@ -0,0 +1,26 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\TextString\TextStringValue;
|
||||
|
||||
/**
 * Dictionary key holding an arbitrary key name as a plain string.
 */
class ExtendedDictionaryKey implements DictionaryKeyInterface {
    public function __construct(
        public readonly string $value,
    ) {}

    /**
     * Builds a key from its raw textual form by dropping leading solidus
     * characters and trailing line feeds, tabs and spaces.
     *
     * @internal
     */
    public static function fromKeyString(string $keyString): self {
        $withoutSolidus = ltrim($keyString, '/');

        return new self(rtrim($withoutSolidus, "\n\t "));
    }

    /**
     * Value types accepted for such a key: a reference, a text string or a dictionary.
     *
     * @api
     */
    #[Override]
    public function getValueTypes(): array {
        return [
            ReferenceValue::class,
            TextStringValue::class,
            Dictionary::class,
        ];
    }
}
|
||||
@ -0,0 +1,17 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryParseContext;
|
||||
|
||||
/**
 * States of the character-by-character dictionary parser.
 *
 * @internal
 */
enum DictionaryParseContext {
    /** Initial state; also reported when no context was recorded for the current nesting level. */
    case ROOT;

    /** Inside a dictionary, between entries. */
    case DICTIONARY;

    /** Reading the characters of a key. */
    case KEY;

    /** Whitespace seen after a key, value not started yet. */
    case KEY_VALUE_SEPARATOR;

    /** Reading a plain value. */
    case VALUE;

    /** Reading a value that was opened with a left parenthesis. */
    case VALUE_IN_PARENTHESES;

    /** Reading a value that was opened with a left square bracket. */
    case VALUE_IN_SQUARE_BRACKETS;

    case VALUE_IN_ANGLE_BRACKETS;

    /** Inside a comment started by a percent sign. */
    case COMMENT;
}
|
||||
@ -0,0 +1,101 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryParseContext;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Generic\Parsing\InfiniteBuffer;
|
||||
|
||||
/**
 * Tracks, per nesting level, the parse state plus the key and value buffers
 * that are being filled while a dictionary is read.
 *
 * @internal
 */
class NestingContext {
    /** Identifier of the level currently being parsed; '' is the outermost level. */
    private string $currentLevel;

    /** @var array<string, DictionaryParseContext> */
    private array $nestingContext = [];

    /** @var array<string, InfiniteBuffer> */
    private array $keyBuffer = [];

    /** @var array<string, InfiniteBuffer> */
    private array $valueBuffer = [];

    public function __construct() {
        $this->currentLevel = '';
    }

    /**
     * Descends one level. The new level is identified by the content of the key buffer
     * of the level being left, or by the current level number plus one when that level has no key buffer.
     */
    public function incrementNesting(): self {
        $parentKeyBuffer = $this->keyBuffer[$this->currentLevel] ?? null;
        $this->currentLevel = $parentKeyBuffer !== null
            ? (string) $parentKeyBuffer
            : (string) ((int) $this->currentLevel + 1);

        return $this;
    }

    /** Drops the most recently recorded context and makes the last remaining level current. */
    public function decrementNesting(): self {
        array_pop($this->nestingContext);
        $this->currentLevel = (string) array_key_last($this->nestingContext);

        return $this;
    }

    /** Records the parse state for the current level. */
    public function setContext(DictionaryParseContext $dictionaryParseContext): self {
        $this->nestingContext[$this->currentLevel] = $dictionaryParseContext;

        return $this;
    }

    /** Parse state of the current level, ROOT when none was recorded. */
    public function getContext(): DictionaryParseContext {
        return $this->nestingContext[$this->currentLevel] ?? DictionaryParseContext::ROOT;
    }

    /** Key buffer of the current level, created on first access. */
    public function getKeyBuffer(): InfiniteBuffer {
        if (!isset($this->keyBuffer[$this->currentLevel])) {
            $this->keyBuffer[$this->currentLevel] = new InfiniteBuffer();
        }

        return $this->keyBuffer[$this->currentLevel];
    }

    public function addToKeyBuffer(string $char): self {
        $this->getKeyBuffer()->addChar($char);

        return $this;
    }

    public function removeFromKeyBuffer(int $nChars = 1): self {
        $this->getKeyBuffer()->removeChar($nChars);

        return $this;
    }

    /** Value buffer of the current level, created on first access. */
    public function getValueBuffer(): InfiniteBuffer {
        if (!isset($this->valueBuffer[$this->currentLevel])) {
            $this->valueBuffer[$this->currentLevel] = new InfiniteBuffer();
        }

        return $this->valueBuffer[$this->currentLevel];
    }

    public function addToValueBuffer(string $char): self {
        $this->getValueBuffer()->addChar($char);

        return $this;
    }

    public function removeFromValueBuffer(int $nChars = 1): self {
        $this->getValueBuffer()->removeChar($nChars);

        return $this;
    }

    /**
     * Non-empty key buffer contents of all levels, in the order the buffers were created.
     *
     * @return list<string>
     */
    public function getKeysFromRoot(): array {
        $bufferContents = array_map(
            static fn (InfiniteBuffer $buffer): string => (string) $buffer,
            $this->keyBuffer,
        );

        return array_values(array_filter(
            $bufferContents,
            static fn (string $content): bool => $content !== '',
        ));
    }

    /** Empties the value and key buffers of the current level, if they exist. */
    public function flush(): self {
        if (isset($this->valueBuffer[$this->currentLevel])) {
            $this->valueBuffer[$this->currentLevel]->flush();
        }

        if (isset($this->keyBuffer[$this->currentLevel])) {
            $this->keyBuffer[$this->currentLevel]->flush();
        }

        return $this;
    }
}
|
||||
112
includes/pdfparser/Document/Dictionary/DictionaryParser.php
Normal file
112
includes/pdfparser/Document/Dictionary/DictionaryParser.php
Normal file
@ -0,0 +1,112 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryParseContext\DictionaryParseContext;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryParseContext\NestingContext;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Character\DelimiterCharacter;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Character\LiteralStringEscapeCharacter;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Character\WhitespaceCharacter;
|
||||
use PrinsFrank\PdfParser\Document\Generic\Parsing\RollingCharBuffer;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Stream\Stream;
|
||||
|
||||
/**
 * Character-driven state machine that reads a dictionary from a stream.
 *
 * @internal
 */
class DictionaryParser {
    /**
     * Reads $nrOfBytes characters starting at $startPos, collecting keys and raw values
     * into a nested array, and converts that array into a Dictionary.
     *
     * @phpstan-assert int<0, max> $startPos
     * @phpstan-assert int<1, max> $nrOfBytes
     *
     * @throws PdfParserException
     */
    public static function parse(Stream $stream, int $startPos, int $nrOfBytes): Dictionary {
        $lessThan = DelimiterCharacter::LESS_THAN_SIGN->value;
        $greaterThan = DelimiterCharacter::GREATER_THAN_SIGN->value;
        $escape = LiteralStringEscapeCharacter::REVERSE_SOLIDUS->value;

        $dictionaryArray = [];
        $rollingCharBuffer = new RollingCharBuffer(6);
        $nestingContext = (new NestingContext())->setContext(DictionaryParseContext::ROOT);
        $arrayNestingLevel = 0;

        foreach ($stream->chars($startPos, $nrOfBytes) as $char) {
            $rollingCharBuffer->next($char);

            // State before this character is handled; every transition below is decided on it.
            $context = $nestingContext->getContext();
            $outsideArray = $context !== DictionaryParseContext::VALUE_IN_SQUARE_BRACKETS;

            if ($char === $lessThan && $rollingCharBuffer->getPreviousCharacter() === $lessThan && $rollingCharBuffer->getPreviousCharacter(2) !== $escape && $outsideArray) {
                // "<<" opens a (nested) dictionary; the first "<" may already have been appended to the key.
                if ($context === DictionaryParseContext::KEY) {
                    $nestingContext->removeFromKeyBuffer();
                }

                $nestingContext->setContext(DictionaryParseContext::DICTIONARY);
                $nestingContext->incrementNesting();
                $nestingContext->setContext(DictionaryParseContext::DICTIONARY);
            } elseif ($char === $lessThan && $context === DictionaryParseContext::KEY) {
                $nestingContext->setContext(DictionaryParseContext::VALUE);
            } elseif ($char === $greaterThan && $rollingCharBuffer->getPreviousCharacter() === $greaterThan && $rollingCharBuffer->getPreviousCharacter(2) !== $escape && $outsideArray) {
                // ">>" closes the current dictionary: drop the first ">" from the value, store the entry, go up a level.
                $nestingContext->removeFromValueBuffer();
                self::flush($dictionaryArray, $nestingContext);
                $nestingContext->decrementNesting()->flush();
            } elseif ($char === DelimiterCharacter::SOLIDUS->value && $rollingCharBuffer->getPreviousCharacter() !== $escape && $outsideArray) {
                if ($context === DictionaryParseContext::DICTIONARY) {
                    $nestingContext->setContext(DictionaryParseContext::KEY);
                } elseif ($context === DictionaryParseContext::VALUE) {
                    // A solidus while reading a value ends that value and starts the next key.
                    self::flush($dictionaryArray, $nestingContext);
                    $nestingContext->setContext(DictionaryParseContext::KEY);
                } elseif (in_array($context, [DictionaryParseContext::KEY, DictionaryParseContext::KEY_VALUE_SEPARATOR], true)) {
                    // A solidus directly after a key starts a value.
                    $nestingContext->setContext(DictionaryParseContext::VALUE);
                }
            } elseif ($char === WhitespaceCharacter::LINE_FEED->value && $outsideArray) {
                if ($context === DictionaryParseContext::KEY) {
                    $nestingContext->setContext(DictionaryParseContext::KEY_VALUE_SEPARATOR);
                } elseif ($context === DictionaryParseContext::VALUE) {
                    self::flush($dictionaryArray, $nestingContext);
                } elseif ($context === DictionaryParseContext::COMMENT) {
                    $nestingContext->setContext(DictionaryParseContext::DICTIONARY);
                }
            } elseif (WhitespaceCharacter::tryFrom($char) !== null && $context === DictionaryParseContext::KEY) {
                $nestingContext->setContext(DictionaryParseContext::KEY_VALUE_SEPARATOR);
            } elseif ($char === DelimiterCharacter::LEFT_PARENTHESIS->value && (in_array($context, [DictionaryParseContext::KEY, DictionaryParseContext::KEY_VALUE_SEPARATOR, DictionaryParseContext::VALUE], true))) {
                $nestingContext->setContext(DictionaryParseContext::VALUE_IN_PARENTHESES);
            } elseif ($char === DelimiterCharacter::RIGHT_PARENTHESIS->value && $rollingCharBuffer->getPreviousCharacter() !== $escape && $context === DictionaryParseContext::VALUE_IN_PARENTHESES) {
                $nestingContext->setContext(DictionaryParseContext::VALUE);
            } elseif ($char === DelimiterCharacter::LEFT_SQUARE_BRACKET->value && (in_array($context, [DictionaryParseContext::KEY, DictionaryParseContext::KEY_VALUE_SEPARATOR, DictionaryParseContext::VALUE, DictionaryParseContext::VALUE_IN_SQUARE_BRACKETS], true))) {
                $nestingContext->setContext(DictionaryParseContext::VALUE_IN_SQUARE_BRACKETS);
                $arrayNestingLevel++;
            } elseif ($char === DelimiterCharacter::RIGHT_SQUARE_BRACKET->value && $context === DictionaryParseContext::VALUE_IN_SQUARE_BRACKETS) {
                // Only the bracket that closes the outermost array ends the bracketed value.
                $arrayNestingLevel--;
                if ($arrayNestingLevel === 0) {
                    $nestingContext->setContext(DictionaryParseContext::VALUE);
                }
            } elseif (trim($char) !== '' && $context === DictionaryParseContext::KEY_VALUE_SEPARATOR) {
                $nestingContext->setContext(DictionaryParseContext::VALUE);
            } elseif ($char === DelimiterCharacter::PERCENT_SIGN->value && $rollingCharBuffer->getPreviousCharacter() !== $escape && $context !== DictionaryParseContext::VALUE_IN_PARENTHESES) {
                $nestingContext->setContext(DictionaryParseContext::COMMENT);
            }

            // The character is stored according to the state it has just produced.
            $contextAfter = $nestingContext->getContext();
            if ($contextAfter === DictionaryParseContext::KEY) {
                $nestingContext->addToKeyBuffer($char);
            } elseif (in_array($contextAfter, [DictionaryParseContext::VALUE_IN_PARENTHESES, DictionaryParseContext::VALUE_IN_SQUARE_BRACKETS, DictionaryParseContext::VALUE], true)) {
                $nestingContext->addToValueBuffer($char);
            }
        }

        return DictionaryFactory::fromArray($dictionaryArray);
    }

    /**
     * Stores the buffered key/value pair of the current level in the nested array,
     * below the keys of its enclosing levels, then empties both buffers.
     * Does nothing when either buffer is empty.
     *
     * @param array<string, mixed> $dictionaryArray
     */
    private static function flush(array &$dictionaryArray, NestingContext $nestingContext): void {
        if ($nestingContext->getValueBuffer()->isEmpty() || $nestingContext->getKeyBuffer()->isEmpty()) {
            return;
        }

        $currentKey = (string) $nestingContext->getKeyBuffer();
        $pathKeys = $nestingContext->getKeysFromRoot();
        $lastIndex = array_key_last($pathKeys);

        $target = &$dictionaryArray;
        foreach ($pathKeys as $index => $pathKey) {
            // The last path element is the key being written itself, not a parent to descend into.
            if ($index === $lastIndex && $pathKey === $currentKey) {
                break;
            }

            /** @phpstan-ignore offsetAccess.nonOffsetAccessible */
            $target = &$target[trim($pathKey)];
        }

        /** @phpstan-ignore offsetAccess.nonOffsetAccessible */
        $target[$currentKey] = trim((string) $nestingContext->getValueBuffer());
        $nestingContext->flush();
    }
}
|
||||
@ -0,0 +1,69 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValueArray;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
/**
 * Generic array value: a flat list of integers, strings and nested array values.
 *
 * @api
 */
class ArrayValue implements DictionaryValue {
    /** @param list<int|string|ArrayValue|ReferenceValueArray|null> $value */
    public function __construct(
        public readonly array $value
    ) {}

    #[Override]
    /**
     * Parses a bracketed, whitespace separated array.
     *
     * Returns null when the trimmed input is not wrapped in square brackets. When the items
     * come in groups of three and the third item is "R", parsing is delegated to ReferenceValueArray.
     *
     * @throws PdfParserException
     */
    public static function fromValue(string $valueString): null|self|ReferenceValueArray {
        $valueString = trim($valueString);
        $isBracketed = str_starts_with($valueString, '[') && str_ends_with($valueString, ']');
        if (!$isBracketed) {
            return null;
        }

        // Put a space between directly adjacent <...><...> items so they become separate tokens.
        $separated = preg_replace('/(<[^>]*>)(?=<[^>]*>)/', '$1 $2', $valueString)
            ?? throw new RuntimeException('An error occurred while sanitizing array value');

        // Strip the outer brackets, make every name start its own token and treat line feeds as spaces.
        $inner = rtrim(ltrim($separated, '[ '), ' ]');
        $inner = str_replace(['/', "\n"], [' /', ' '], $inner);
        $normalized = preg_replace('/\s+/', ' ', $inner)
            ?? throw new RuntimeException('An error occurred while removing duplicate spaces from array value');

        $tokens = explode(' ', $normalized);
        if (count($tokens) % 3 === 0 && ($tokens[2] ?? null) === 'R') {
            return ReferenceValueArray::fromValue($normalized);
        }

        $items = [];
        foreach ($tokens as $token) {
            if ($token === '') {
                continue;
            }

            $items[] = match (true) {
                str_starts_with($token, '[') && str_ends_with($token, ']') => self::fromValue($token),
                (string) (int) $token === $token => (int) $token,
                default => $token,
            };
        }

        return new self($items);
    }

    /**
     * Renders the array back to its bracketed textual form.
     *
     * NOTE(review): references are written as "<objectNumber> R" without a second number — confirm this is intended.
     *
     * @throws ParseFailureException when an item is of an unsupported type
     */
    public function toString(): string {
        $parts = [];
        foreach ($this->value as $item) {
            $parts[] = match (true) {
                is_int($item),
                is_float($item),
                is_string($item) => $item,
                $item instanceof ArrayValue => $item->toString(),
                $item instanceof ReferenceValueArray => implode(' ', array_map(fn (ReferenceValue $referenceValue) => $referenceValue->objectNumber . ' R', $item->referenceValues)),
                default => throw new ParseFailureException('Unsupported array value type: ' . gettype($item)),
            };
        }

        return '[' . trim(implode(' ', $parts)) . ']';
    }
}
|
||||
@ -0,0 +1,65 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item\ConsecutiveCIDWidth;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item\RangeCIDWidth;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
|
||||
/**
 * Glyph widths of a CID font, stored as a sequence of width items.
 *
 * @see 9.7.4.3 Glyph metrics in CIDFonts
 */
class CIDFontWidths implements DictionaryValue {
    /** @var list<ConsecutiveCIDWidth|RangeCIDWidth> */
    private readonly array $widths;

    /** @no-named-arguments */
    public function __construct(
        ConsecutiveCIDWidth|RangeCIDWidth ...$widths,
    ) {
        $this->widths = $widths;
    }

    /**
     * Returns the width reported by the first item that knows the character code,
     * or null when no item does.
     */
    public function getWidthForCharacter(int $characterCode): ?float {
        foreach ($this->widths as $widthItem) {
            if (($widthForCharacterCode = $widthItem->getWidthForCharacterCode($characterCode)) !== null) {
                return $widthForCharacterCode;
            }
        }

        return null;
    }

    /**
     * Parses a widths array made of "start [w1 w2 ...]" and "start end width" groups.
     *
     * Returns null when no group is found or when a number in a group is not in canonical form.
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        $valueString = str_replace("\n", ' ', $valueString);
        if (preg_match_all('/(?<startingCID>[0-9]+)\s*(?<CIDS>[0-9]+\s*[0-9.]+|\[[0-9. ]+\])/', $valueString, $matches, PREG_SET_ORDER) <= 0) {
            return null;
        }

        $widths = [];
        foreach ($matches as $match) {
            if ((string) ($startingCID = (int) $match['startingCID']) !== $match['startingCID']) {
                return null;
            }

            if (str_starts_with($match['CIDS'], '[') && str_ends_with($match['CIDS'], ']')) {
                // Tokenise on whitespace runs: padding such as "[ 500 600 ]" or doubled spaces must
                // not turn into empty tokens, which floatval() would silently map to a width of 0.
                $widthTokens = self::splitOnWhitespace(rtrim(ltrim($match['CIDS'], '['), ']'));
                $widths[] = new ConsecutiveCIDWidth($startingCID, array_map(floatval(...), $widthTokens));

                continue;
            }

            $arguments = self::splitOnWhitespace($match['CIDS']);
            if (count($arguments) !== 2) {
                return null;
            }

            // NOTE(review): the float round-trip check rejects widths such as "500.0"
            // ((string) 500.0 is "500") — confirm whether that strictness is intended.
            if ((string)($endCID = (int) $arguments[0]) !== $arguments[0] || (string)($width = (float) $arguments[1]) !== $arguments[1]) {
                return null;
            }

            $widths[] = new RangeCIDWidth($startingCID, $endCID, $width);
        }

        return new self(... $widths);
    }

    /**
     * Splits a string on runs of whitespace and drops empty tokens.
     *
     * @return list<string>
     */
    private static function splitOnWhitespace(string $value): array {
        $tokens = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? [] : $tokens;
    }
}
|
||||
@ -0,0 +1,50 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
/**
 * The three field widths, in bytes, of one cross-reference stream record.
 *
 * @api
 */
class CrossReferenceStreamByteSizes implements DictionaryValue {
    public function __construct(
        public readonly int $lengthRecord1InBytes,
        public readonly int $lengthRecord2InBytes,
        public readonly int $lengthRecord3InBytes,
    ) {
    }

    /**
     * Sum of the three field widths.
     *
     * @throws RuntimeException when the sum is smaller than 1
     * @return int<1, max>
     */
    public function getTotalLengthInBytes(): int {
        $totalLength = $this->lengthRecord1InBytes + $this->lengthRecord2InBytes + $this->lengthRecord3InBytes;
        if ($totalLength < 1) {
            throw new RuntimeException(sprintf('Total length should not be less than 1, got %d', $totalLength));
        }

        return $totalLength;
    }

    /**
     * Parses a bracketed array of exactly three integers, e.g. "[1 2 1]".
     *
     * The integers may be separated (and padded) by any amount of whitespace.
     * Returns null for anything that is not such an array.
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        $valueString = trim($valueString);
        if (!str_starts_with($valueString, '[') || !str_ends_with($valueString, ']')) {
            return null;
        }

        // Split on whitespace runs so doubled spaces or line breaks between the numbers
        // do not produce empty or merged tokens.
        $values = preg_split('/\s+/', rtrim(ltrim($valueString, '['), ']'), -1, PREG_SPLIT_NO_EMPTY);
        if ($values === false || count($values) !== 3) {
            return null;
        }

        foreach ($values as $value) {
            // Only canonical integers are accepted ("1", not "01" or "1.0").
            if ((string) (int) $value !== $value) {
                return null;
            }
        }

        return new self((int) $values[0], (int) $values[1], (int) $values[2]);
    }
}
|
||||
@ -0,0 +1,55 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryParser;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Exception\PdfParserException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
use PrinsFrank\PdfParser\Stream\InMemoryStream;
|
||||
|
||||
/**
 * Array value whose items are dictionaries.
 */
class DictionaryArrayValue implements DictionaryValue {
    /** @var list<Dictionary> */
    public readonly array $dictionaries;

    /** @no-named-arguments */
    public function __construct(
        Dictionary... $dictionaries,
    ) {
        $this->dictionaries = $dictionaries;
    }

    #[Override]
    /**
     * Parses an array of dictionaries such as "[<< ... >> << ... >>]".
     *
     * Returns null when, ignoring spaces and line breaks, the input does not start with
     * "[<<" or "[null" and end with ">>]" or "null]".
     *
     * @throws PdfParserException
     */
    public static function fromValue(string $valueString): ?self {
        $valueStringWithoutSpaces = str_replace([' ', "\r", "\n"], '', $valueString);
        if ((str_starts_with($valueStringWithoutSpaces, '[<<') === false && str_starts_with($valueStringWithoutSpaces, '[null') === false)
            || (str_ends_with($valueStringWithoutSpaces, '>>]') === false && str_ends_with($valueStringWithoutSpaces, 'null]') === false)) {
            return null;
        }

        // The check above ignores spaces and line breaks, but the slicing below removes a fixed
        // three characters from each end. Remove that whitespace around the outer brackets first,
        // so "[ << ... >> ]" is cut at the same positions as "[<< ... >>]".
        $valueString = preg_replace(['/^[ \r\n]*\[[ \r\n]*/', '/[ \r\n]*\][ \r\n]*$/'], ['[', ']'], $valueString)
            ?? throw new RuntimeException('An error occurred while normalizing dictionary array brackets');

        // Put a space between directly adjacent dictionaries so they can be split on ">> <<".
        $valueString = preg_replace('/(<<[^>]*>>)(?=<<[^>]*>>)/', '$1 $2', $valueString)
            ?? throw new RuntimeException('An error occurred while sanitizing dictionary array value');

        $dictionaries = [];
        foreach (explode('>> <<', substr($valueString, 3, -3)) as $dictionaryValueString) {
            if ($dictionaryValueString === '') {
                $dictionaries[] = new Dictionary();

                continue;
            }

            $memoryStream = new InMemoryStream('<<' . $dictionaryValueString . '>>');
            $dictionaries[] = DictionaryParser::parse($memoryStream, 0, $memoryStream->getSizeInBytes());
        }

        return new self(... $dictionaries);
    }

    /** Merges the entries of all contained dictionaries, in order, into one Dictionary. */
    public function toSingleDictionary(): ?Dictionary {
        $dictionaryEntries = [];
        foreach ($this->dictionaries as $dictionary) {
            foreach ($dictionary->dictionaryEntries as $dictionaryEntry) {
                $dictionaryEntries[] = $dictionaryEntry;
            }
        }

        return new Dictionary(... $dictionaryEntries);
    }
}
|
||||
@ -0,0 +1,57 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\GlyphLists\AGlyphList;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item\DifferenceRange;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValueArray;
|
||||
|
||||
/**
 * Array value that groups glyph names into ranges, each starting at an integer code.
 */
class DifferencesArrayValue implements DictionaryValue {
    /** @param list<DifferenceRange> $differenceRanges */
    public function __construct(
        private readonly array $differenceRanges,
    ) {}

    /**
     * Parses an array in which every integer starts a new range and every following
     * string names the glyph for the next consecutive code.
     *
     * Returns null when the input is not a plain array or contains an item that is
     * neither an integer nor a string.
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        $arrayValue = ArrayValue::fromValue($valueString);
        if ($arrayValue === null || $arrayValue instanceof ReferenceValueArray) {
            return null;
        }

        $ranges = [];
        $glyphs = [];
        $rangeStart = null;
        foreach ($arrayValue->value as $item) {
            if (is_string($item)) {
                $glyphs[] = AGlyphList::tryFrom(ltrim($item, '/'));

                continue;
            }

            if (!is_int($item)) {
                return null;
            }

            // An integer closes the range that was being collected and opens the next one.
            if ($rangeStart !== null) {
                $ranges[] = new DifferenceRange($rangeStart, $glyphs);
                $glyphs = [];
            }

            $rangeStart = $item;
        }

        if ($rangeStart !== null) {
            $ranges[] = new DifferenceRange($rangeStart, $glyphs);
        }

        return new self($ranges);
    }

    /** Returns the glyph of the first range containing the code, or null when no range contains it. */
    public function getGlyph(int $int): ?AGlyphList {
        foreach ($this->differenceRanges as $range) {
            if (!$range->contains($int)) {
                continue;
            }

            return $range->getGlyph($int);
        }

        return null;
    }
}
|
||||
@ -0,0 +1,20 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item;
|
||||
|
||||
/**
 * Widths for a run of consecutive character codes: the width at offset N belongs to
 * character code ($cidStart + N).
 */
class ConsecutiveCIDWidth {
    /** @param list<float> $widths */
    public function __construct(
        public readonly int $cidStart,
        public readonly array $widths,
    ) {
    }

    /**
     * Looks up the stored width for a character code and divides it by 1000.
     *
     * @return float|null null when this run has no entry for the given code
     */
    public function getWidthForCharacterCode(int $characterCode): ?float {
        $offset = $characterCode - $this->cidStart;

        return array_key_exists($offset, $this->widths)
            ? $this->widths[$offset] / 1000
            : null;
    }
}
|
||||
@ -0,0 +1,33 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item;
|
||||
|
||||
use PrinsFrank\GlyphLists\AGlyphList;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\RuntimeException;
|
||||
|
||||
/**
 * A run of glyphs assigned to consecutive indexes, starting at $firstIndex.
 */
class DifferenceRange {
    /** @param list<AGlyphList|null> $characters */
    public function __construct(
        private readonly int $firstIndex,
        private readonly array $characters,
    ) {
    }

    /** Tells whether $index lies in [firstIndex, firstIndex + number of characters). */
    public function contains(int $index): bool {
        if ($index < $this->firstIndex) {
            return false;
        }

        return $index < $this->firstIndex + count($this->characters);
    }

    /**
     * Returns the glyph stored for $index, which may be null.
     *
     * @throws InvalidArgumentException when $index is outside this range
     * @throws RuntimeException when the range contains $index but no entry exists at its offset
     */
    public function getGlyph(int $index): ?AGlyphList {
        if ($this->contains($index) === false) {
            throw new InvalidArgumentException('This difference range does not contain index ' . $index);
        }

        $offset = $index - $this->firstIndex;
        if (array_key_exists($offset, $this->characters) === false) {
            throw new RuntimeException('Expected glyph to be present, but it was not');
        }

        return $this->characters[$offset];
    }
}
|
||||
@ -0,0 +1,20 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Array\Item;
|
||||
|
||||
/**
 * A single width shared by every character code from $cidStart up to and including $cidEnd.
 */
class RangeCIDWidth {
    public function __construct(
        public readonly int $cidStart,
        public readonly int $cidEnd,
        public readonly float $width,
    ) {
    }

    /**
     * Returns the shared width divided by 1000 for codes inside the range.
     *
     * @return float|null null when the code lies outside [cidStart, cidEnd]
     */
    public function getWidthForCharacterCode(int $characterCode): ?float {
        $isWithinRange = $characterCode >= $this->cidStart && $characterCode <= $this->cidEnd;

        return $isWithinRange ? $this->width / 1000 : null;
    }
}
|
||||
@ -0,0 +1,27 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Boolean;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
|
||||
/**
 * Dictionary value wrapping a boolean.
 *
 * @api
 */
class BooleanValue implements DictionaryValue {
    public function __construct(
        public readonly bool $value,
    ) {
    }

    /** Accepts exactly the strings 'true' and 'false'; anything else yields null. */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        return match ($valueString) {
            'true' => new self(true),
            'false' => new self(false),
            default => null,
        };
    }
}
|
||||
@ -0,0 +1,64 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Date;
|
||||
|
||||
use DateTimeImmutable;
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
use PrinsFrank\PdfParser\Exception\InvalidArgumentException;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
use ValueError;
|
||||
|
||||
/**
 * Dictionary value wrapping a date parsed from a "D:"-prefixed date string.
 *
 * @api
 */
class DateValue implements DictionaryValue {
    public function __construct(
        public readonly ?DateTimeImmutable $value
    ) {
    }

    /**
     * Parses a date from a hexadecimal string (<...>), a literal string ((...)) or a bare value.
     *
     * Steps:
     *  1. A hex string is unwrapped and converted to its raw bytes.
     *  2. A literal string is unwrapped and its three-digit octal escapes are replaced by
     *     the byte they denote.
     *  3. When the result does not start with "D:", it is converted from UTF-16 to UTF-8
     *     and checked again.
     *  4. The value is parsed as "D:YYYYMMDDHHmmSS", optionally followed by a timezone;
     *     apostrophes are removed before parsing.
     *
     * @return self|null null when the value is not recognised as a date
     * @throws InvalidArgumentException when a hex string contains non-hexadecimal characters or has odd length
     * @throws ParseFailureException when replacing the octal escapes fails
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        if (str_starts_with($valueString, '<') && str_ends_with($valueString, '>')) {
            $valueString = substr($valueString, 1, -1);
            if (!ctype_xdigit($valueString) || strlen($valueString) % 2 !== 0) {
                throw new InvalidArgumentException(sprintf('String "%s" is not hexadecimal', substr($valueString, 0, 10)));
            }

            $valueString = hex2bin($valueString);
            if ($valueString === false) {
                return null;
            }
        }

        if (str_starts_with($valueString, '(') && str_ends_with($valueString, ')')) {
            $valueString = preg_replace_callback(
                '/\\\\([0-7]{3})/',
                // An octal escape denotes one byte. chr() keeps it a single byte, whereas
                // mb_chr() would emit a multi-byte UTF-8 sequence for values >= 0x80 and
                // corrupt an escaped UTF-16 byte order mark (\376\377), making the UTF-16
                // conversion below fail.
                static fn (array $matches): string => chr((int) octdec($matches[1]) & 0xFF),
                substr($valueString, 1, -1)
            ) ?? throw new ParseFailureException();
        }

        if (!str_starts_with($valueString, 'D:')) {
            $valueString = mb_convert_encoding($valueString, 'UTF-8', 'UTF-16');
            if ($valueString === false || !str_starts_with($valueString, 'D:')) {
                return null;
            }
        }

        try {
            $parsedDate = DateTimeImmutable::createFromFormat(
                // Exactly 14 digits means there is no timezone part to parse.
                preg_match('/^D:\d{14}$/', $valueString) === 1 ? '\D\:YmdHis' : '\D\:YmdHisP',
                str_replace("'", '', $valueString)
            );
        } catch (ValueError) {
            return null;
        }

        if ($parsedDate === false) {
            return null;
        }

        return new self($parsedDate);
    }
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue;
|
||||
|
||||
/**
 * Contract for value types that can be created from the raw string of a dictionary value.
 */
interface DictionaryValue {
    /**
     * Attempts to build an instance from the raw value string.
     *
     * @return self|null the signature allows null; the visible implementations use it for
     *                   strings they do not recognise
     */
    public static function fromValue(string $valueString): ?self;
}
|
||||
@ -0,0 +1,25 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Float;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
|
||||
/**
 * Dictionary value wrapping a float.
 *
 * @api
 */
class FloatValue implements DictionaryValue {
    public function __construct(
        public readonly float $value
    ) {
    }

    /**
     * Accepts a string only when formatting its float value back, with the same number of
     * decimals, reproduces the string exactly.
     *
     * @return self|null null when the round trip does not reproduce the input
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        $valueAsFloat = (float) $valueString;

        // Number of characters after the last '.', or 0 when there is no '.'.
        $nrOfDecimals = (int) strpos(strrev($valueString), '.');

        // Pass the separators explicitly: the default thousands separator (',') would turn
        // "1234.5" into "1,234.5" and reject every value with a magnitude of 1000 or more.
        if (number_format($valueAsFloat, $nrOfDecimals, '.', '') !== $valueString) {
            return null;
        }

        return new self($valueAsFloat);
    }
}
|
||||
@ -0,0 +1,25 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Integer;
|
||||
|
||||
use Override;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue;
|
||||
|
||||
/**
 * Dictionary value wrapping an integer.
 *
 * @api
 */
class IntegerValue implements DictionaryValue {
    public function __construct(
        public readonly int $value
    ) {
    }

    /**
     * Accepts a string only when casting it to int and back to string reproduces it exactly.
     *
     * @return self|null null when the round trip does not reproduce the input
     */
    #[Override]
    public static function fromValue(string $valueString): ?self {
        $valueAsInt = (int) $valueString;

        return (string) $valueAsInt === $valueString
            ? new self($valueAsInt)
            : null;
    }
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of authentication event names; each case is backed by its own name. */
enum AuthEventNameValue: string implements NameValue {
    case DocOpen = 'DocOpen';
    case EFOpen = 'EFOpen';
}
|
||||
@ -0,0 +1,19 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of blend mode names; each case is backed by its own name. */
enum BlendModeNameValue: string implements NameValue {
    case Normal = 'Normal';
    case Compatible = 'Compatible';
    case Multiply = 'Multiply';
    case Screen = 'Screen';
    case Overlay = 'Overlay';
    case Darken = 'Darken';
    case Lighten = 'Lighten';
    case ColorDodge = 'ColorDodge';
    case ColorBurn = 'ColorBurn';
    case HardLight = 'HardLight';
    case SoftLight = 'SoftLight';
    case Difference = 'Difference';
    case Exclusion = 'Exclusion';
}
|
||||
@ -0,0 +1,11 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of border styles; descriptive case names map to single-letter values. */
enum BorderStyleNameValue: string implements NameValue {
    case Solid = 'S';
    case Dashed = 'D';
    case Beveled = 'B';
    case Inset = 'I';
    case Underline = 'U';
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/**
 * String-backed enum of CFM name values.
 *
 * NOTE(review): "CFM" presumably stands for crypt filter method - confirm.
 */
enum CFMNameValue: string implements NameValue {
    case None = 'None';
    case V2 = 'V2';
    case AESV2 = 'AESV2';
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of CIE colour space names; each case is backed by its own name. */
enum CIEColorSpaceNameValue: string implements NameValue {
    case CalGray = 'CalGray';
    case CalRGB = 'CalRGB';
    case Lab = 'Lab';
    case ICCBased = 'ICCBased';
}
|
||||
@ -0,0 +1,19 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Image\ColorSpace\Components;
|
||||
|
||||
/** String-backed enum of device colour space names, each mapped to a Components value. */
enum DeviceColorSpaceNameValue: string implements NameValue {
    case DeviceGray = 'DeviceGray';
    case DeviceRGB = 'DeviceRGB';
    case DeviceCMYK = 'DeviceCMYK';

    /** Returns the Components case that corresponds to this colour space. */
    public function getComponents(): Components {
        return match ($this) {
            self::DeviceCMYK => Components::CMYK,
            self::DeviceRGB => Components::RGB,
            self::DeviceGray => Components::Gray,
        };
    }
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/**
 * String-backed enum of direction names.
 *
 * NOTE(review): L2R / R2L presumably mean left-to-right / right-to-left - confirm.
 */
enum DirectionNameValue: string implements NameValue {
    case L2R = 'L2R';
    case R2L = 'R2L';
}
|
||||
@ -0,0 +1,26 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\CMap\Registry\Adobe\Identity0;
|
||||
use PrinsFrank\PdfParser\Document\Encoding\MacRoman;
|
||||
use PrinsFrank\PdfParser\Document\Encoding\WinAnsi;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/** String-backed enum of encoding names, able to decode a character group to unicode. */
enum EncodingNameValue: string implements NameValue {
    case IdentityV = 'Identity-V';
    case IdentityH = 'Identity-H';
    case MacRomanEncoding = 'MacRomanEncoding';
    case MacExpertEncoding = 'MacExpertEncoding';
    case WinAnsiEncoding = 'WinAnsiEncoding';

    /**
     * Decodes $characterGroup with the decoder that belongs to this encoding.
     *
     * @throws ParseFailureException for MacExpertEncoding, the only case without a decoder
     */
    public function decodeString(string $characterGroup): string {
        return match ($this) {
            self::WinAnsiEncoding => WinAnsi::textToUnicode($characterGroup),
            self::MacRomanEncoding => MacRoman::textToUnicode($characterGroup),
            self::IdentityH,
            self::IdentityV => (new Identity0())->getToUnicodeCMap()->textToUnicode($characterGroup),
            self::MacExpertEncoding => throw new ParseFailureException(sprintf('Unsupported encoding %s', $this->name)),
        };
    }
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of event names; each case is backed by its own name. */
enum EventNameValue: string implements NameValue {
    case View = 'View';
    case Print = 'Print';
    case Export = 'Export';
}
|
||||
@ -0,0 +1,75 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\Dictionary;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey;
|
||||
use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Integer\IntegerValue;
|
||||
use PrinsFrank\PdfParser\Document\Document;
|
||||
use PrinsFrank\PdfParser\Document\Filter\Decode\ASCII85Decode;
|
||||
use PrinsFrank\PdfParser\Document\Filter\Decode\CCITTFaxDecode;
|
||||
use PrinsFrank\PdfParser\Document\Filter\Decode\FlateDecode;
|
||||
use PrinsFrank\PdfParser\Document\Filter\Decode\LZWFlatePredictorValue;
|
||||
use PrinsFrank\PdfParser\Document\Image\ImageType;
|
||||
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
||||
|
||||
/**
 * String-backed enum of filter names, with helpers to decode content for a filter and to
 * map a filter to an image type.
 */
enum FilterNameValue: string implements NameValue {
    case ASCII_HEX_DECODE = 'ASCIIHexDecode';
    case ASCII_85_DECODE = 'ASCII85Decode';
    case LZW_DECODE = 'LZWDecode';
    case FLATE_DECODE = 'FlateDecode';
    case RUN_LENGTH_DECODE = 'RunLengthDecode';
    case CCITT_FAX_DECODE = 'CCITTFaxDecode';
    case JBIG2_DECODE = 'JBIG2Decode';
    case DCT_DECODE = 'DCTDecode'; // Grayscale or color image data encoded in JPEG baseline format
    case JPX_DECODE = 'JPXDecode';
    case CRYPT = 'Crypt';
    case ADOBE_PPK_LITE = 'Adobe.PPKLite';
    case ADOBE_PUB_SEC = 'Adobe.PubSec';
    case ENTRUST_PPKEF = 'Entrust.PPKEF';
    case CICI_SIGN_IT = 'CIC.SignIt';
    case VERISIGN_PPKVS = 'Verisign.PPKVS';

    /**
     * Decodes $content according to this filter.
     *
     * DCT, JPX and JBIG2 content is returned unchanged. Flate, CCITT fax and ASCII85 content
     * is handed to the matching decoder class, using the DECODE_PARMS sub dictionary of
     * $dictionary where parameters are needed.
     *
     * @throws ParseFailureException when a required CCITT fax parameter is missing, or when
     *                               this filter has no decoder here
     */
    public function decodeBinary(string $content, ?Dictionary $dictionary, ?Document $document): string {
        $filterParams = $dictionary?->getSubDictionary($document, DictionaryKey::DECODE_PARMS);

        return match ($this) {
            // Image payloads are passed through as-is.
            self::DCT_DECODE,
            self::JPX_DECODE,
            self::JBIG2_DECODE => $content,
            self::ASCII_85_DECODE => ASCII85Decode::decodeBinary($content),
            self::FLATE_DECODE => FlateDecode::decodeBinary(
                $content,
                // No parameters, or an unknown predictor value, both mean "no predictor".
                (
                    $filterParams === null
                        ? null
                        : LZWFlatePredictorValue::tryFrom((int) $filterParams->getValueForKey(DictionaryKey::PREDICTOR, IntegerValue::class)?->value)
                ) ?? LZWFlatePredictorValue::None,
                $filterParams?->getValueForKey(DictionaryKey::COLUMNS, IntegerValue::class)->value ?? 1,
            ),
            self::CCITT_FAX_DECODE => CCITTFaxDecode::addHeaderAndIFD(
                $content,
                // Evaluated first: throws before the non-nullsafe calls below when parameters are absent.
                $filterParams?->getValueForKey(DictionaryKey::COLUMNS, IntegerValue::class)->value
                    ?? throw new ParseFailureException('Missing columns'),
                $filterParams->getValueForKey(DictionaryKey::ROWS, IntegerValue::class)->value
                    ?? $dictionary->getValueForKey(DictionaryKey::HEIGHT, IntegerValue::class)->value
                    ?? throw new ParseFailureException('Missing rows'),
                $filterParams->getValueForKey(DictionaryKey::K, IntegerValue::class)->value
                    ?? throw new ParseFailureException('Missing K'),
            ),
            default => throw new ParseFailureException(sprintf('Content "%.100s..." cannot be decoded for filter "%s"', $content, $this->name)),
        };
    }

    /** Returns the image type associated with this filter, or null when there is none. */
    public function getImageType(): ?ImageType {
        return match ($this) {
            self::DCT_DECODE => ImageType::JPEG,
            self::JPX_DECODE => ImageType::JPEG2000,
            self::JBIG2_DECODE => ImageType::JBIG2,
            self::FLATE_DECODE => ImageType::PNG,
            self::LZW_DECODE => ImageType::TIFF,
            self::CCITT_FAX_DECODE => ImageType::TIFF_FAX,
            self::RUN_LENGTH_DECODE => ImageType::RAW,
            default => null,
        };
    }
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of intent names; each case is backed by its own name. */
enum IntentNameValue: string implements NameValue {
    case All = 'All';
    case View = 'View';
    case Design = 'Design';
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of list mode names; each case is backed by its own name. */
enum ListModeNameValue: string implements NameValue {
    case AllPages = 'AllPages';
    case VisiblePages = 'VisiblePages';
}
|
||||
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/**
 * Marker interface without methods, implemented by the name value enums.
 *
 * @api
 */
interface NameValue {
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of non-full-screen page mode names; each case is backed by its own name. */
enum NonFullScreenPageModeNameValue: string implements NameValue {
    case UseNone = 'UseNone';
    case UseOutlines = 'UseOutlines';
    case UseThumbs = 'UseThumbs';
    case UseOC = 'UseOC';
}
|
||||
@ -0,0 +1,11 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/**
 * String-backed enum of numbering styles; descriptive case names map to single-letter
 * values. The values are case-sensitive: 'R'/'r' and 'A'/'a' are distinct cases.
 */
enum NumberingStyleNameValue: string implements NameValue {
    case DecimalArabic = 'D';
    case UpperCaseRomanNumerals = 'R';
    case LowerCaseRomanNumerals = 'r';
    case UpperCaseLetters = 'A';
    case LowerCaseLetters = 'a';
}
|
||||
@ -0,0 +1,12 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of page layout names; each case is backed by its own name. */
enum PageLayoutNameValue: string implements NameValue {
    case SinglePage = 'SinglePage';
    case OneColumn = 'OneColumn';
    case TwoColumnLeft = 'TwoColumnLeft';
    case TwoColumnRight = 'TwoColumnRight';
    case TwoPageLeft = 'TwoPageLeft';
    case TwoPageRight = 'TwoPageRight';
}
|
||||
@ -0,0 +1,12 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of page mode names; upper-snake case names map to the name strings. */
enum PageModeNameValue: string implements NameValue {
    case USE_NONE = 'UseNone';
    case USE_OUTLINES = 'UseOutlines';
    case USE_THUMBS = 'UseThumbs';
    case FULL_SCREEN = 'FullScreen';
    case USE_O_C = 'UseOC';
    case USE_ATTACHMENTS = 'UseAttachments';
}
|
||||
@ -0,0 +1,9 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of paper handling names; each case is backed by its own name. */
enum PaperHandlingNameValue: string implements NameValue {
    case Simplex = 'Simplex';
    case DuplexFlipShortEdge = 'DuplexFlipShortEdge';
    case DuplexFlipLongEdge = 'DuplexFlipLongEdge';
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of rendering intent names; each case is backed by its own name. */
enum RenderingIntentNameValue: string implements NameValue {
    case AbsoluteColorimetric = 'AbsoluteColorimetric';
    case RelativeColorimetric = 'RelativeColorimetric';
    case Saturation = 'Saturation';
    case Perceptual = 'Perceptual';
}
|
||||
@ -0,0 +1,7 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of security handler names; 'Standard' is the only case. */
enum SecurityHandlerNameValue: string implements NameValue {
    case Standard = 'Standard';
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of special colour space names; each case is backed by its own name. */
enum SpecialColorSpaceNameValue: string implements NameValue {
    case Pattern = 'Pattern';
    case Indexed = 'Indexed';
    case DeviceN = 'DeviceN';
    case Separation = 'Separation';
}
|
||||
@ -0,0 +1,21 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of subtype names; upper-snake case names map to the name strings. */
enum SubtypeNameValue: string implements NameValue {
    case CID_FONT_TYPE_0 = 'CIDFontType0';
    case CID_FONT_TYPE_0_C = 'CIDFontType0C';
    case CID_FONT_TYPE_2 = 'CIDFontType2';
    case FORM = 'Form';
    case IMAGE = 'Image';
    case LINK = 'Link';
    case STREAM = 'Stream';
    case TRUE_TYPE = 'TrueType';
    case TYPE_0 = 'Type0';
    case TYPE_1 = 'Type1';
    case TYPE_1_C = 'Type1C';
    case TYPE_3 = 'Type3';
    case XML = 'XML';
    case TEXT = 'Text';
}
|
||||
@ -0,0 +1,15 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of tab order names; descriptive case names map to single-letter values. */
enum TabsNameValue: string implements NameValue {
    case RowOrder = 'R';
    case ColumnOrder = 'C';
    case StructureOrder = 'S';

    /** @since PDF2.0 */
    case AnnotationsArrayOrder = 'A';

    /** @since PDF2.0 */
    case WidgetOrder = 'W';
}
|
||||
@ -0,0 +1,18 @@
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of transition style names; each case is backed by its own name. */
enum TransitionStyleNameValue: string implements NameValue {
    case Split = 'Split';
    case Blinds = 'Blinds';
    case Box = 'Box';
    case Wipe = 'Wipe';
    case Dissolve = 'Dissolve';
    case Glitter = 'Glitter';
    case R = 'R';
    case Fly = 'Fly';
    case Push = 'Push';
    case Cover = 'Cover';
    case Uncover = 'Uncover';
    case Fade = 'Fade';
}
|
||||
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
/** String-backed enum of trapped states: 'True', 'False' or 'Unknown'. */
enum TrappedNameValue: string implements NameValue {
    case TRUE = 'True';
    case FALSE = 'False';
    case UNKNOWN = 'Unknown';
}
|
||||
@ -0,0 +1,155 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name;
|
||||
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Catalog;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\DecoratedObject;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\EmbeddedFile;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\FileSpecification;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Font;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\GenericObject;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Page;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\Pages;
|
||||
use PrinsFrank\PdfParser\Document\Object\Decorator\XObject;
|
||||
|
||||
/**
 * String-backed enum of type names, each of which can be mapped to the decorator class
 * used for objects of that type.
 */
enum TypeNameValue: string implements NameValue {
    case _3_D = '3D';
    case _3_D_ANIMATION_STYLE = '3DAnimationStyle';
    case _3_D_B_G = '3DBG';
    case _3_D_CROSS_SECTION = '3DCrossSection';
    case _3_D_LIGHTING_SCHEME = '3DLightingScheme';
    case _3_D_MEASURE = '3DMeasure';
    case _3_D_NODE = '3DNode';
    case _3_D_REF = '3DRef';
    case _3_D_RENDER_MODE = '3DRenderMode';
    case _3_D_VIEW = '3DView';
    case ACTION = 'Action';
    case ANNOT = 'Annot';
    case BACKGROUND = 'Background';
    case BEAD = 'Bead';
    case BORDER = 'Border';
    case C_I_D_FONT = 'CIDFont';
    case C_MAP = 'CMap';
    case CATALOG = 'Catalog';
    case COLLECTION = 'Collection';
    case COLLECTION_COLORS = 'CollectionColors';
    case COLLECTION_FIELD = 'CollectionField';
    case COLLECTION_ITEM = 'CollectionItem';
    case COLLECTION_SCHEMA = 'CollectionSchema';
    case COLLECTION_SORT = 'CollectionSort';
    case COLLECTION_SPLIT = 'CollectionSplit';
    case COLLECTION_SUB_ITEM = 'CollectionSubItem';
    case CRYPT = 'Crypt';
    case CRYPT_ALGORITHM = 'CryptAlgorithm';
    case CRYPT_FILTER = 'CryptFilter';
    case CRYPT_FILTER_DECODE_PARMS = 'CryptFilterDecodeParms';
    case D_PART = 'DPart';
    case D_PART_ROOT = 'DPartRoot';
    case DEVELOPER_EXTENSIONS = 'DeveloperExtensions';
    case DOC_TIME_STAMP = 'DocTimeStamp';
    case DSS = 'DSS';
    case EMBEDDED_FILE = 'EmbeddedFile';
    case ENCODING = 'Encoding';
    case ENCRYPTED_PAYLOAD = 'EncryptedPayload';
    case EX_DATA = 'ExData';
    case EXT_G_STATE = 'ExtGState';
    case EXTENSIONS = 'Extensions';
    case F_W_PARAMS = 'FWParams';
    case FILE_SPEC = 'Filespec';
    case FILL_SIGN_DATA = 'FillSignData';
    case FIXED_PRINT = 'FixedPrint';
    case FOLDER = 'Folder';
    case FONT = 'Font';
    case FONT_DESCRIPTOR = 'FontDescriptor';
    case GEO_G_C_S = 'GEOGCS';
    case GROUP = 'Group';
    case HALF_TONE = 'Halftone';
    case INLINE = 'Inline';
    case LAYOUT = 'Layout';
    case M_C_R = 'MCR';
    case MARK_INFO = 'MarkInfo';
    case MASK = 'Mask';
    case MEASURE = 'Measure';
    case MEDIA_CLIP = 'MediaClip';
    case MEDIA_CRITERIA = 'MediaCriteria';
    case MEDIA_DURATION = 'MediaDuration';
    case MEDIA_OFFSET = 'MediaOffset';
    case MEDIA_PERMISSIONS = 'MediaPermissions';
    case MEDIA_PLAY_PARAMS = 'MediaPlayParams';
    case MEDIA_PLAYER_INFO = 'MediaPlayerInfo';
    case MEDIA_PLAYERS = 'MediaPlayers';
    case MEDIA_SCREEN_PARAMS = 'MediaScreenParams';
    case METADATA = 'Metadata';
    case MIN_BIT_DEPTH = 'MinBitDepth';
    case MIN_SCREEN_SIZE = 'MinScreenSize';
    case NAMESPACE = 'Namespace';
    case NAV_NODE = 'NavNode';
    case NAVIGATOR = 'Navigator';
    case NUMBER_FORMAT = 'NumberFormat';
    case O_B_J_R = 'OBJR';
    case O_C_G = 'OCG';
    case O_C_M_D = 'OCMD';
    case O_P_I = 'OPI';
    case OBJ_STM = 'ObjStm';
    case OUTLINES = 'Outlines';
    case OUTPUT_INTENT = 'OutputIntent';
    case PAGE = 'Page';
    case PAGE_LABEL = 'PageLabel';
    case PAGES = 'Pages';
    case PAGINATION = 'Pagination';
    case PATTERN = 'Pattern';
    case PROJ_C_S = 'PROJCS';
    case PT_DATA = 'PtData';
    case RENDITION = 'Rendition';
    case RESOURCE = 'Resource';
    case REQ_HANDLER = 'ReqHandler';
    case REQUIREMENT = 'Requirement';
    case RICH_MEDIA_ACTIVATION = 'RichMediaActivation';
    case RICH_MEDIA_ANIMATION = 'RichMediaAnimation';
    case RICH_MEDIA_COMMAND = 'RichMediaCommand';
    case RICH_MEDIA_CONFIGURATION = 'RichMediaConfiguration';
    case RICH_MEDIA_CONTENT = 'RichMediaContent';
    case RICH_MEDIA_DEACTIVATION = 'RichMediaDeactivation';
    case RICH_MEDIA_INSTANCE = 'RichMediaInstance';
    case RICH_MEDIA_POSITION = 'RichMediaPosition';
    case RICH_MEDIA_PRESENTATION = 'RichMediaPresentation';
    case RICH_MEDIA_SETTINGS = 'RichMediaSettings';
    case RICH_MEDIA_WINDOW = 'RichMediaWindow';
    case S_V = 'SV';
    case S_V_CERT = 'SVCert';
    case SIG = 'Sig';
    case SIG_FIELD_LOCK = 'SigFieldLock';
    case SIG_REF = 'SigRef';
    case SLIDESHOW = 'Slideshow';
    case SOFTWARE_IDENTIFIER = 'SoftwareIdentifier';
    case SOUND = 'Sound';
    case SPIDER_CONTENT_SET = 'SpiderContentSet';
    case STREAM = 'Stream';
    case STRUCT_ELEM = 'StructElem';
    case STRUCT_TREE_ROOT = 'StructTreeRoot';
    case TEMPLATE = 'Template';
    case THREAD = 'Thread';
    case TIMESPAN = 'Timespan';
    case TRANS = 'Trans';
    case TRANSFORM_PARAMS = 'TransformParams';
    case VIEWER_PREFERENCES = 'ViewerPreferences';
    case VIEWPORT = 'Viewport';
    case VRI = 'VRI';
    case X_OBJECT = 'XObject';
    case X_REF = 'XRef';

    /**
     * Returns the fully qualified name of the decorator class for this type.
     * Types without a dedicated decorator map to GenericObject.
     *
     * @return class-string<DecoratedObject>
     */
    public function getDecoratorFQN(): string {
        return match ($this) {
            self::PAGE => Page::class,
            self::PAGES => Pages::class,
            self::CATALOG => Catalog::class,
            self::FONT => Font::class,
            self::X_OBJECT => XObject::class,
            self::EMBEDDED_FILE => EmbeddedFile::class,
            self::FILE_SPEC => FileSpecification::class,
            default => GenericObject::class,
        };
    }
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user