Source code for pdfbeaver.state_tracker

# src/pdfbeaver/state_tracker.py
"""Tracks the PDF Text State parameters as defined in the PDF 1.7 Reference.

Attributes:
    char_spacing (float): Character spacing ($Tc$). Defaults to 0.0.
    word_spacing (float): Word spacing ($Tw$). Defaults to 0.0.
    horiz_scaling (float): Horizontal scaling ($Tz$). Defaults to 100.0.
    leading (float): Text leading ($Tl$). Defaults to 0.0.
    font_name (Optional[str]): The PostScript name of the current font
        (e.g. 'Helvetica-Bold'), if available. This is not the resource
        name (e.g. '/F1').
    fontsize (float): The current font size ($Tfs$).
    render_mode (int): The text rendering mode ($Tr$).
    rise (float): Text rise ($Ts$). Defaults to 0.0.
    knockout (bool): Text knockout flag. Defaults to True.
    matrix (List[float]): The Text Matrix ($Tm$), stored as a 6-element list
        ``[a, b, c, d, e, f]``.
    line_matrix (List[float]): The Text Line Matrix ($Tlm$), stored as a
        6-element list.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from .utils.pdf_conversion import miner_matrix_to_np

logger = logging.getLogger(__name__)


@dataclass
class TextState:  # pylint: disable=too-many-instance-attributes
    """Container for the PDF text-state parameters.

    Every field carries a default, so ``TextState()`` yields a state with
    identity text matrices and no font selected.

    Attributes:
        char_spacing (float): Character spacing ($Tc$). Defaults to 0.0.
        word_spacing (float): Word spacing ($Tw$). Defaults to 0.0.
        horiz_scaling (float): Horizontal scaling ($Tz$). Defaults to 100.0.
        leading (float): Text leading ($Tl$). Defaults to 0.0.
        font_name (Optional[str]): PostScript name of the current font
            (e.g., 'Helvetica-Bold') when known; *not* the resource name
            (e.g., '/F1'). Defaults to None.
        fontsize (float): Current font size ($Tfs$). Defaults to 0.0.
        render_mode (int): Text rendering mode ($Tr$). Defaults to 0.
        rise (float): Text rise ($Ts$). Defaults to 0.0.
        knockout (bool): Text knockout flag. Defaults to True.
        matrix (List[float]): Text matrix ($Tm$) as the six numbers
            ``[a, b, c, d, e, f]``. Defaults to the identity.
        line_matrix (List[float]): Text line matrix ($Tlm$) in the same
            six-number form. Defaults to the identity.
    """

    char_spacing: float = 0.0
    word_spacing: float = 0.0
    horiz_scaling: float = 100.0
    leading: float = 0.0
    font_name: Optional[str] = None
    fontsize: float = 0.0
    render_mode: int = 0
    rise: float = 0.0
    knockout: bool = True

    # Each instance needs its own list objects, hence the factories rather
    # than a shared literal default.
    matrix: List[float] = field(default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0])
    line_matrix: List[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )

    def copy(self):
        """Return a new ``TextState`` whose matrices are independent lists."""
        params = dict(self.__dict__)
        # Duplicate the two matrices so edits to the clone never reach
        # the lists held by this instance.
        params["matrix"] = list(self.matrix)
        params["line_matrix"] = list(self.line_matrix)
        return TextState(**params)


@dataclass
class GraphicsState:
    """Tracks the PDF Graphics State parameters.

    Attributes:
        ctm (List[float]): The Current Transformation Matrix ($CTM$), stored as
            a 6-element list ``[a, b, c, d, e, f]``. Defaults to the identity.
    """

    # Annotated as a list (not np.ndarray): the default factory and copy()
    # below both store a plain Python list here.
    ctm: List[float] = field(default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0])

    def copy(self) -> "GraphicsState":
        """Return a copy of this state whose ``ctm`` is an independent list."""
        new_obj = GraphicsState(**self.__dict__)
        # Re-copy the matrix so mutating the clone cannot affect this instance.
        new_obj.ctm = list(self.ctm)
        return new_obj


class StateTracker:
    """
    State tracker. Tracks the CTM (Graphics) and Text Matrices.

    This tracker acts as a bridge between the underlying ``pdfminer`` state
    machine and the ``pdfbeaver`` context. It ingests snapshots of the state
    provided by the iterator and makes them accessible in a clean, pythonic
    format.
    """

    def __init__(self) -> None:
        """Start with default graphics/text state and an empty state stack."""
        self.gstate = GraphicsState()
        self.gstack: List[GraphicsState] = []
        self.textstate = TextState()
        self.text_obj_active = False

    def get_matrices(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Calculates the effective transformation matrices.

        Returns:
            Tuple[np.ndarray, np.ndarray]: A tuple containing:
                1. The CTM (3x3 numpy array).
                2. The Text Render Matrix (TM x CTM) (3x3 numpy array).
        """
        ctm = miner_matrix_to_np(self.gstate.ctm)
        tm = miner_matrix_to_np(self.textstate.matrix)
        # Text matrix first, then CTM: the same order in which
        # get_current_user_pos applies them to a row vector.
        trm = tm @ ctm
        return ctm, trm

    def set_state(self, state: Dict[str, Any]) -> None:
        """
        Updates the internal state to match the snapshot provided by the
        iterator.

        This is the 'Passive Tracking' model: we trust the engine (pdfminer).

        Args:
            state: Snapshot mapping. Recognised keys are ``"ctm"`` (iterable
                of 6 floats) and ``"tstate"`` (an object exposing
                ``charspace``, ``wordspace``, ``scaling``, ``leading``,
                ``font``, ``fontsize``, ``render``, ``rise``, ``matrix`` and
                ``linematrix``). A falsy ``state`` is ignored and other keys
                are not read.
        """
        if not state:
            return

        # Sync CTM (pdfminer provides tuple/list of 6 floats)
        if "ctm" in state:
            self.gstate.ctm = list(state["ctm"])

        # Sync Text State
        # Note: pdfminer returns a PDFTextState object in 'tstate'
        # We map specific fields we care about to our internal structure
        if "tstate" in state:
            src = state["tstate"]
            dst = self.textstate
            dst.char_spacing = src.charspace
            dst.word_spacing = src.wordspace
            dst.horiz_scaling = src.scaling
            dst.leading = src.leading
            # font_name is the font's ``basefont`` when a font is selected
            # and exposes that attribute; otherwise None.
            dst.font_name = getattr(src.font, "basefont", None) if src.font else None
            dst.fontsize = src.fontsize
            dst.render_mode = src.render
            dst.rise = src.rise
            # Copy into fresh lists, as is done for the CTM above: keeps the
            # declared List[float] type and avoids sharing the engine's
            # matrix objects.
            dst.matrix = list(src.matrix)
            dst.line_matrix = list(src.linematrix)
            # ``knockout`` is deliberately left untouched: nothing in the
            # snapshot is mapped onto it.

    def get_snapshot(self) -> Dict[str, Any]:
        """
        Returns a snapshot of the current state.

        Returns:
            Dict[str, Any]: ``"ctm"`` (a copy of the 6-element CTM list),
            ``"tstate"`` and ``"gstate"`` (copies of the tracked text and
            graphics state) and ``"font_name"`` (current font name or None).
        """
        return {
            # Copied so the snapshot does not alias the tracker's live list,
            # consistent with the copied tstate/gstate entries.
            "ctm": list(self.gstate.ctm),
            "tstate": self.textstate.copy(),
            "gstate": self.gstate.copy(),
            "font_name": self.textstate.font_name,
        }

    def get_current_user_pos(self) -> np.ndarray:
        """
        Returns the (x, y) position of the cursor in User Space.

        Calculated as: Origin(0,0) x Tm x CTM.

        Returns:
            np.ndarray: A 3-element vector [x, y, 1] representing the cursor
            position.
        """
        # Start of text space (0, 0)
        p = np.array([0.0, 0.0, 1.0])

        # Apply Text Matrix
        a, b, c, d, e, f = self.textstate.matrix
        tm = np.array([[a, b, 0], [c, d, 0], [e, f, 1]])

        # Apply CTM
        ctm = miner_matrix_to_np(self.gstate.ctm)

        return p @ tm @ ctm