2026-01-09 07:13:59 +00:00

102 lines
5.3 KiB
PHP

<?php
declare(strict_types=1);
namespace PrinsFrank\PdfParser\Document\ContentStream;
use PrinsFrank\PdfParser\Document\ContentStream\Command\ContentStreamCommand;
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\GraphicsStateOperator;
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTransformationMatrix;
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\InteractsWithTextState;
use PrinsFrank\PdfParser\Document\ContentStream\Command\Operator\State\Interaction\ProducesPositionedTextElements;
use PrinsFrank\PdfParser\Document\ContentStream\Object\TextObject;
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\PositionedTextElement;
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;
use PrinsFrank\PdfParser\Document\Document;
use PrinsFrank\PdfParser\Document\Object\Decorator\Page;
use PrinsFrank\PdfParser\Exception\ParseFailureException;
use PrinsFrank\PdfParser\Exception\PdfParserException;
/** @api */
class ContentStream {
/** @var list<TextObject|ContentStreamCommand> */
public readonly array $content;
/** @no-named-arguments */
public function __construct(
TextObject|ContentStreamCommand... $content
) {
$this->content = $content;
}
/** @return list<PositionedTextElement> */
public function getPositionedTextElements(): array {
$positionedTextElements = $transformationStateStack = [];
$textState = null; // See table 103, Tf operator for initial value
$transformationMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0); // Identity matrix
foreach ($this->content as $content) {
if ($content instanceof ContentStreamCommand) {
if ($content->operator instanceof InteractsWithTextState) {
$textState = $content->operator->applyToTextState($content->operands, $textState);
} elseif ($content->operator === GraphicsStateOperator::SaveCurrentStateToStack) {
$transformationStateStack[] = clone $transformationMatrix;
} elseif ($content->operator === GraphicsStateOperator::RestoreMostRecentStateFromStack) {
$transformationMatrix = array_pop($transformationStateStack)
?? throw new ParseFailureException();
} elseif ($content->operator instanceof InteractsWithTransformationMatrix) {
$transformationMatrix = $content->operator->applyToTransformationMatrix($content->operands, $transformationMatrix);
}
continue;
}
$textMatrix = new TransformationMatrix(1, 0, 0, 1, 0, 0); // Identity matrix, See Table 106, Tm operator for initial value in text object
foreach ($content->contentStreamCommands as $contentStreamCommand) {
if ($contentStreamCommand->operator instanceof InteractsWithTextState) {
$textState = $contentStreamCommand->operator->applyToTextState($contentStreamCommand->operands, $textState);
}
if ($contentStreamCommand->operator instanceof InteractsWithTransformationMatrix) {
$textMatrix = $contentStreamCommand->operator->applyToTransformationMatrix($contentStreamCommand->operands, $textMatrix);
}
if ($contentStreamCommand->operator instanceof ProducesPositionedTextElements && $textState !== null) {
$positionedTextElements[] = $contentStreamCommand->operator->getPositionedTextElement($contentStreamCommand->operands, $textMatrix, $transformationMatrix, $textState);
}
}
}
usort(
$positionedTextElements,
static function (PositionedTextElement $a, PositionedTextElement $b): int {
if (($differenceY = $b->absoluteMatrix->offsetY <=> $a->absoluteMatrix->offsetY) !== 0) {
return $differenceY;
}
return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX;
}
);
return $positionedTextElements;
}
/** @throws PdfParserException */
public function getText(Document $document, Page $page): string {
$text = '';
$previousPositionedTextElement = null;
foreach ($this->getPositionedTextElements() as $positionedTextElement) {
if ($previousPositionedTextElement !== null) {
if ($previousPositionedTextElement->absoluteMatrix->offsetY !== $positionedTextElement->absoluteMatrix->offsetY) {
$text .= "\n";
} elseif (($positionedTextElement->absoluteMatrix->offsetX - $previousPositionedTextElement->absoluteMatrix->offsetX - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40) {
$text .= ' ';
}
}
$text .= $positionedTextElement->getText($document, $page);
$previousPositionedTextElement = $positionedTextElement;
}
return $text;
}
}