*/
    public readonly array $content;

    /**
     * Stores the given text objects and content stream commands, in the order supplied.
     *
     * @no-named-arguments
     */
    public function __construct(
        TextObject|ContentStreamCommand... $content
    ) {
        $this->content = $content;
    }

    /**
     * Walks the content in order and collects every positioned text element it produces.
     *
     * Top-level commands update the text state, save/restore the transformation matrix on a
     * stack, or modify the transformation matrix. Every other content item is processed as a
     * text object: it starts with a fresh identity text matrix, while the text state carries
     * over from whatever was processed before it. Operators that produce text are skipped for
     * as long as no text state has been established.
     *
     * The result is sorted by descending offsetY of the absolute matrix, and by ascending
     * offsetX for elements sharing the same offsetY.
     *
     * @throws ParseFailureException when a restore operator is encountered while the state stack is empty
     *
     * @return list<PositionedTextElement>
     */
    public function getPositionedTextElements(): array {
        $positionedTextElements = $transformationStateStack = [];
        $textState = null; // See table 103, Tf operator for initial value
        $transformationMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0); // Identity matrix

        foreach ($this->content as $content) {
            if ($content instanceof ContentStreamCommand) {
                if ($content->operator instanceof InteractsWithTextState) {
                    $textState = $content->operator->applyToTextState($content->operands, $textState);
                } elseif ($content->operator === GraphicsStateOperator::SaveCurrentStateToStack) {
                    // Keep a copy so later matrix changes cannot affect the saved state.
                    $transformationStateStack[] = clone $transformationMatrix;
                } elseif ($content->operator === GraphicsStateOperator::RestoreMostRecentStateFromStack) {
                    // array_pop() yields null on an empty stack: a restore without a matching save.
                    $transformationMatrix = array_pop($transformationStateStack)
                        ?? throw new ParseFailureException();
                } elseif ($content->operator instanceof InteractsWithTransformationMatrix) {
                    $transformationMatrix = $content->operator->applyToTransformationMatrix(
                        $content->operands,
                        $transformationMatrix,
                    );
                }

                continue;
            }

            // Identity matrix, See Table 106, Tm operator for initial value in text object
            $textMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0);
            foreach ($content->contentStreamCommands as $contentStreamCommand) {
                // These checks are intentionally independent (not elseif): a single operator
                // may implement more than one of the interfaces.
                if ($contentStreamCommand->operator instanceof InteractsWithTextState) {
                    $textState = $contentStreamCommand->operator->applyToTextState(
                        $contentStreamCommand->operands,
                        $textState,
                    );
                }

                if ($contentStreamCommand->operator instanceof InteractsWithTransformationMatrix) {
                    $textMatrix = $contentStreamCommand->operator->applyToTransformationMatrix(
                        $contentStreamCommand->operands,
                        $textMatrix,
                    );
                }

                if ($contentStreamCommand->operator instanceof ProducesPositionedTextElements && $textState !== null) {
                    $positionedTextElements[] = $contentStreamCommand->operator->getPositionedTextElement(
                        $contentStreamCommand->operands,
                        $textMatrix,
                        $transformationMatrix,
                        $textState,
                    );
                }
            }
        }

        usort(
            $positionedTextElements,
            static function (PositionedTextElement $a, PositionedTextElement $b): int {
                // Primary key: offsetY, descending ($b compared against $a).
                if (($differenceY = $b->absoluteMatrix->offsetY <=> $a->absoluteMatrix->offsetY) !== 0) {
                    return $differenceY;
                }

                // Secondary key: offsetX, ascending.
                return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX;
            }
        );

        return $positionedTextElements;
    }

    /**
     * Concatenates the text of all positioned text elements in sorted order.
     *
     * Between two consecutive elements:
     * - a newline is inserted when their absolute offsetY values differ;
     * - otherwise a single space is inserted when the horizontal gap between the end of the
     *   previous element and the start of the current one is at least 40% of the previous
     *   element's font size (10 when the font size is not set) multiplied by the previous
     *   element's horizontal scale.
     *
     * @throws PdfParserException
     */
    public function getText(Document $document, Page $page): string {
        $text = '';
        $previousPositionedTextElement = null;
        foreach ($this->getPositionedTextElements() as $positionedTextElement) {
            if ($previousPositionedTextElement !== null) {
                if ($previousPositionedTextElement->absoluteMatrix->offsetY !== $positionedTextElement->absoluteMatrix->offsetY) {
                    $text .= "\n";
                } elseif (
                    (
                        $positionedTextElement->absoluteMatrix->offsetX
                        - $previousPositionedTextElement->absoluteMatrix->offsetX
                        // The measured characters, text state and matrix all belong to the previous
                        // element, so its own font must supply the glyph widths. Using the current
                        // element's font gives a wrong gap whenever the font changes mid-line.
                        - $previousPositionedTextElement->getFont($document, $page)->getWidthForChars(
                            $previousPositionedTextElement->getCodePoints(),
                            $previousPositionedTextElement->textState,
                            $previousPositionedTextElement->absoluteMatrix,
                        )
                    ) >= ($previousPositionedTextElement->textState->fontSize ?? 10)
                        * $previousPositionedTextElement->absoluteMatrix->scaleX
                        * 0.40
                ) {
                    $text .= ' ';
                }
            }

            $text .= $positionedTextElement->getText($document, $page);
            $previousPositionedTextElement = $positionedTextElement;
        }

        return $text;
    }
}