218 lines
12 KiB
PHP
218 lines
12 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
|
|
namespace PrinsFrank\PdfParser\Document\ContentStream;
|
|
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\ContentStreamCommand;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\CompatibilityOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\InlineImageOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\MarkedContentOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\Object\TextObjectOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ClippingPathOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\ColorOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\GraphicsStateOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathConstructionOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\PathPaintingOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextPositioningOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextShowingOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\TextStateOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Type3FontOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\XObjectOperator;
|
|
use PrinsFrank\PdfParser\Document\ContentStream\Object\TextObject;
|
|
use PrinsFrank\PdfParser\Document\Object\Decorator\DecoratedObject;
|
|
use PrinsFrank\PdfParser\Exception\ParseFailureException;
|
|
|
|
/**
 * Splits the bytes of one or more content stream objects into a ContentStream consisting of
 * ContentStreamCommands (operator + raw operand string) and TextObjects, which collect the
 * commands found between a BT and an ET operator.
 *
 * @internal
 */
class ContentStreamParser {
    /**
     * Walks over every byte of the supplied content streams while keeping track of the
     * syntactic context (literal string, hex string, name, dictionary, array). Operators are
     * only looked for outside of those contexts; everything between two operators is stored,
     * trimmed, as the operand string of the command.
     *
     * All state is kept across the supplied objects, so a construct may start in one stream
     * and end in the next one. Operands left over at the end of a stream are prepended to the
     * first command found in the following stream.
     *
     * @param list<DecoratedObject> $contentsObjects
     * @throws ParseFailureException when an ET operator is found without a preceding BT operator
     */
    public static function parse(array $contentsObjects): ContentStream {
        $content = [];
        $inStringLiteral = $escapeNextStringChar = $inResourceName = $inDictionary = false;
        $inArrayLevel = $inStringLevel = $nestedParenthesesLevel = 0;
        $textObject = $previousChar = $secondToLastChar = $thirdToLastChar = $previousContentStream = $startPreviousOperandIndex = null;
        foreach ($contentsObjects as $contentsObject) {
            $startCurrentOperandIndex = 0;
            $contentStream = $contentsObject->getStream();
            $contentStreamSize = $contentStream->getSizeInBytes();
            for ($index = 0; $index < $contentStreamSize; $index++) {
                $char = $contentStream->read($index, 1);
                if ($inStringLiteral === true) {
                    if ($escapeNextStringChar === true) {
                        // This character is escaped by the preceding backslash, so it can neither
                        // open/close parentheses nor start a new escape sequence ("\\" is a literal backslash)
                        $escapeNextStringChar = false;
                    } elseif ($char === '\\') {
                        $escapeNextStringChar = true;
                    } elseif ($char === '(') {
                        // Balanced parentheses are allowed unescaped inside a literal string
                        $nestedParenthesesLevel++;
                    } elseif ($char === ')') {
                        if ($nestedParenthesesLevel > 0) {
                            $nestedParenthesesLevel--;
                        } else {
                            $inStringLiteral = false;
                        }
                    }
                } elseif ($inResourceName === true) {
                    // The terminating character is consumed here and not evaluated by the branches below
                    if (in_array($char, [' ', '<', '(', '/', "\r", "\n"], true) && $previousChar !== '\\') {
                        $inResourceName = false;
                    }
                } elseif ($inDictionary === true) {
                    if ($char === '>' && $previousChar === '>' && $secondToLastChar !== '\\') {
                        $inDictionary = false;
                    }
                } elseif ($char === '[' && $previousChar !== '\\') {
                    $inArrayLevel++;
                } elseif ($char === '<' && $previousChar === '<' && $secondToLastChar !== '\\') {
                    $inDictionary = true;
                } elseif ($char === '<' && $previousChar !== '\\' && $contentStream->read($index + 1, 1) !== '<') {
                    // A single '<' that is not the first half of '<<' opens a hexadecimal string
                    $inStringLevel++;
                } elseif ($char === '(' && $previousChar !== '\\') {
                    $inStringLiteral = true;
                } elseif ($char === '/' && $previousChar !== '\\') {
                    $inResourceName = true;
                } elseif ($inStringLevel > 0 || $inArrayLevel > 0) {
                    // Inside a hex string or array nothing is an operator; only look for the closing delimiter
                    if ($inStringLevel > 0 && $char === '>' && $previousChar !== '\\') {
                        $inStringLevel--;
                    } elseif ($inArrayLevel > 0 && $char === ']' && $previousChar !== '\\') {
                        $inArrayLevel--;
                    }
                } elseif ($char === 'T' && $previousChar === 'B') { // TextObjectOperator::BEGIN
                    $startCurrentOperandIndex = $index + 1;
                    $textObject = new TextObject();
                } elseif ($char === 'T' && $previousChar === 'E') { // TextObjectOperator::END
                    $startCurrentOperandIndex = $index + 1;
                    if ($textObject === null) {
                        throw new ParseFailureException('Encountered TextObjectOperator::END without preceding TextObjectOperator::BEGIN');
                    }

                    $content[] = $textObject;
                    $textObject = null;
                } elseif ($char === 'C'
                    && (($secondToLastChar === 'B' && ($previousChar === 'M' || $previousChar === 'D')) || ($secondToLastChar === 'E' && $previousChar === 'M'))) { // MarkedContentOperator::BeginMarkedContent, MarkedContentOperator::EndMarkedContent, MarkedContentOperator::BeginMarkedContentWithProperties
                    // Marked content operators are not stored as commands; their operands are dropped
                    $startCurrentOperandIndex = $index + 1;
                } elseif (($operator = self::getOperator($char, $previousChar, $secondToLastChar, $thirdToLastChar)) !== null
                    && (($nextChar = $contentStream->read($index + 1, 1)) === '' || self::getOperator($nextChar, $char, $previousChar, $secondToLastChar) === null)) { // Skip the current hit if the next iteration is also a valid operator
                    $operands = '';
                    if ($previousContentStream !== null && $startPreviousOperandIndex !== null && $startPreviousOperandIndex < $previousContentStream->getSizeInBytes()) {
                        // Prepend what was left unconsumed at the end of the previous stream, once
                        $operands .= $previousContentStream->read($startPreviousOperandIndex, $previousContentStream->getSizeInBytes() - $startPreviousOperandIndex);
                        $startPreviousOperandIndex = null;
                    }
                    if (($operandLength = $index + 1 - $startCurrentOperandIndex - strlen($operator->value)) > 0) {
                        $operands .= $contentStream->read($startCurrentOperandIndex, $operandLength);
                    }

                    $command = new ContentStreamCommand($operator, trim($operands));
                    if ($textObject !== null) {
                        $textObject->addContentStreamCommand($command);
                    } else {
                        $content[] = $command;
                    }

                    $startCurrentOperandIndex = $index + 1;
                }

                $thirdToLastChar = $secondToLastChar;
                $secondToLastChar = $previousChar;
                $previousChar = $char;
            }

            $previousContentStream = $contentStream;
            $startPreviousOperandIndex = $startCurrentOperandIndex;
        }

        return new ContentStream(...$content);
    }

    /**
     * Returns the operator that ends at the current character, or null when there is none.
     * Longer operators take precedence: three letter operators are checked first, then two
     * letter operators, then single letter operators. A match is rejected when the character
     * directly preceding the operator is a backslash or a forward slash.
     *
     * This method uses three maps instead of calling $enum::tryFrom for all possible enums
     * as operator retrieval happens possibly millions of times in a single file
     */
    public static function getOperator(string $currentChar, ?string $previousChar, ?string $secondToLastChar, ?string $thirdToLastChar): CompatibilityOperator|InlineImageOperator|MarkedContentOperator|TextObjectOperator|ClippingPathOperator|ColorOperator|GraphicsStateOperator|PathConstructionOperator|PathPaintingOperator|TextPositioningOperator|TextShowingOperator|TextStateOperator|Type3FontOperator|XObjectOperator|null {
        $threeLetterMatch = match ($secondToLastChar . $previousChar . $currentChar) {
            'BMC' => MarkedContentOperator::BeginMarkedContent,
            'BDC' => MarkedContentOperator::BeginMarkedContentWithProperties,
            'EMC' => MarkedContentOperator::EndMarkedContent,
            'SCN' => ColorOperator::SetStrokingParams,
            'scn' => ColorOperator::SetColorParams,
            default => null,
        };
        if ($threeLetterMatch !== null) {
            return in_array($thirdToLastChar, ['\\', '/'], true) ? null : $threeLetterMatch;
        }

        $twoLetterMatch = match ($previousChar . $currentChar) {
            'BX' => CompatibilityOperator::BeginCompatibilitySection,
            'EX' => CompatibilityOperator::EndCompatibilitySection,
            'BI' => InlineImageOperator::Begin,
            'ID' => InlineImageOperator::BeginImageData,
            'EI' => InlineImageOperator::End,
            'MP' => MarkedContentOperator::Tag, // The marked-content point operator is "MP"; "MD" is not a PDF operator
            'DP' => MarkedContentOperator::TagProperties,
            'BT' => TextObjectOperator::BEGIN,
            'ET' => TextObjectOperator::END,
            'W*' => ClippingPathOperator::INTERSECT_EVEN_ODD,
            'CS' => ColorOperator::SetName,
            'cs' => ColorOperator::SetNameNonStroking,
            'SC' => ColorOperator::SetStrokingColor,
            'sc' => ColorOperator::SetColor,
            'RG' => ColorOperator::SetStrokingColorDeviceRGB,
            'rg' => ColorOperator::SetColorDeviceRGB,
            'cm' => GraphicsStateOperator::ModifyCurrentTransformationMatrix,
            'ri' => GraphicsStateOperator::SetIntent,
            'gs' => GraphicsStateOperator::SetDictName,
            're' => PathConstructionOperator::RECTANGLE,
            'f*' => PathPaintingOperator::FILL_EVEN_ODD,
            'B*' => PathPaintingOperator::FILL_STROKE_EVEN_ODD,
            'b*' => PathPaintingOperator::CLOSE_FILL_STROKE,
            'Td' => TextPositioningOperator::MOVE_OFFSET,
            'TD' => TextPositioningOperator::MOVE_OFFSET_LEADING,
            'Tm' => TextPositioningOperator::SET_MATRIX,
            'T*' => TextPositioningOperator::NEXT_LINE,
            'Tj' => TextShowingOperator::SHOW,
            'TJ' => TextShowingOperator::SHOW_ARRAY,
            'Tc' => TextStateOperator::CHAR_SPACE,
            'Tw' => TextStateOperator::WORD_SPACE,
            'Tz' => TextStateOperator::SCALE,
            'TL' => TextStateOperator::LEADING,
            'Tf' => TextStateOperator::FONT_SIZE,
            'Tr' => TextStateOperator::RENDER,
            'Ts' => TextStateOperator::RISE,
            'd0' => Type3FontOperator::SetWidth,
            'd1' => Type3FontOperator::SetWidthAndBoundingBox,
            'Do' => XObjectOperator::Paint,
            default => null,
        };
        if ($twoLetterMatch !== null) {
            return in_array($secondToLastChar, ['\\', '/'], true) ? null : $twoLetterMatch;
        }

        $oneLetterMatch = match ($currentChar) {
            'W' => ClippingPathOperator::INTERSECT,
            'G' => ColorOperator::SetStrokingColorSpace,
            'g' => ColorOperator::SetColorSpace,
            'K' => ColorOperator::SetStrokingColorDeviceCMYK,
            'k' => ColorOperator::SetColorDeviceCMYK,
            'q' => GraphicsStateOperator::SaveCurrentStateToStack,
            'Q' => GraphicsStateOperator::RestoreMostRecentStateFromStack,
            'w' => GraphicsStateOperator::SetLineWidth,
            'J' => GraphicsStateOperator::SetLineCap,
            'j' => GraphicsStateOperator::SetLineJoin,
            'M' => GraphicsStateOperator::SetMiterJoin,
            'd' => GraphicsStateOperator::SetLineDash,
            'i' => GraphicsStateOperator::SetFlatness,
            'm' => PathConstructionOperator::MOVE,
            'l' => PathConstructionOperator::LINE,
            'c' => PathConstructionOperator::CURVE_BEZIER_123,
            'v' => PathConstructionOperator::CURVE_BEZIER_23,
            'y' => PathConstructionOperator::CURVE_BEZIER_13,
            'h' => PathConstructionOperator::CLOSE,
            'S' => PathPaintingOperator::STROKE,
            's' => PathPaintingOperator::CLOSE_STROKE,
            'f' => PathPaintingOperator::FILL,
            'F' => PathPaintingOperator::FILL_DEPRECATED,
            'B' => PathPaintingOperator::FILL_STROKE,
            'n' => PathPaintingOperator::END,
            '\'' => TextShowingOperator::MOVE_SHOW,
            '"' => TextShowingOperator::MOVE_SHOW_SPACING,
            default => null,
        };

        if ($oneLetterMatch !== null) {
            return in_array($previousChar, ['\\', '/'], true) ? null : $oneLetterMatch;
        }

        return null;
    }
}
|