# Source code for docx.text.pagebreak

"""Proxy objects related to rendered page-breaks."""

from __future__ import annotations

from typing import TYPE_CHECKING

from docx import types as t
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.shared import Parented

if TYPE_CHECKING:
    from docx.text.paragraph import Paragraph


class RenderedPageBreak(Parented):
    """A page-break inserted by Word during page-layout for print or display purposes.

    This usually does not correspond to a "hard" page-break inserted by the document
    author, rather just that Word ran out of room on one page and needed to start
    another. The position of these can change depending on the printer and page-size, as
    well as margins, etc. They also will change in response to edits, but not until Word
    loads and saves the document.

    Note these are never inserted by `python-docx` because it has no rendering function.
    These are generally only useful for text-extraction of existing documents when
    `python-docx` is being used solely as a document "reader".

    NOTE: a rendered page-break can occur within a hyperlink; consider a multi-word
    hyperlink like "excellent Wikipedia article on LLMs" that happens to fall close to
    the end of the last line on a page such that the page breaks between "Wikipedia" and
    "article". In such a "page-breaks-in-hyperlink" case, THESE METHODS WILL "MOVE" THE
    PAGE-BREAK to occur after the hyperlink, such that the entire hyperlink appears in
    the paragraph returned by `.preceding_paragraph_fragment`. While this places the
    "tail" text of the hyperlink on the "wrong" page, it avoids having two hyperlinks
    each with a fragment of the actual text and pointing to the same address.
    """

    def __init__(
        self,
        lastRenderedPageBreak: CT_LastRenderedPageBreak,
        parent: t.ProvidesStoryPart,
    ):
        """Wrap `lastRenderedPageBreak`, passing `parent` on to the base class."""
        super().__init__(parent)
        # -- the same element is stored under two names: the generic `_element` and
        # -- the type-specific `_lastRenderedPageBreak` used by the properties below.
        self._element = lastRenderedPageBreak
        self._lastRenderedPageBreak = lastRenderedPageBreak

    @property
    def preceding_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph containing the content preceding this page-break.

        Compare `.following_paragraph_fragment` as these two are intended to be used
        together.

        This value is `None` when no content precedes this page-break. This case is
        common and occurs whenever a page breaks on an even paragraph boundary.
        Returning `None` for this case avoids "inserting" a non-existent paragraph into
        the content stream. Note that content can include DrawingML items like images or
        charts.

        Note the returned paragraph *is divorced from the document body*. Any changes
        made to it will not be reflected in the document. It is intended to provide a
        familiar container (`Paragraph`) to interrogate for the content preceding this
        page-break in the paragraph in which it occurred.

        Contains the entire hyperlink when this break occurs within a hyperlink.
        """
        if self._lastRenderedPageBreak.precedes_all_content:
            return None

        # -- `Paragraph` is only imported under `TYPE_CHECKING` at module level, so
        # -- the runtime import happens here (presumably to avoid a circular import).
        from docx.text.paragraph import Paragraph

        return Paragraph(self._lastRenderedPageBreak.preceding_fragment_p, self._parent)

    @property
    def following_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph containing the content following this page-break.

        HAS POTENTIALLY SURPRISING BEHAVIORS so read carefully to be sure this is what
        you want. This is primarily targeted toward text-extraction use-cases for which
        precisely associating text with the page it occurs on is important.

        Compare `.preceding_paragraph_fragment` as these two are intended to be used
        together.

        This value is `None` when no content follows this page-break. This case is
        unlikely to occur in practice because Word places even-paragraph-boundary
        page-breaks on the paragraph *following* the page-break. Still, it is possible
        and must be checked for. Returning `None` for this case avoids "inserting" an
        extra, non-existent paragraph into the content stream. Note that content can
        include DrawingML items like images or charts, not just text.

        The returned paragraph *is divorced from the document body*. Any changes made to
        it will not be reflected in the document. It is intended to provide a container
        (`Paragraph`) with familiar properties and methods that can be used to
        characterize the paragraph content following a mid-paragraph page-break.

        Contains no portion of the hyperlink when this break occurs within a hyperlink.
        """
        if self._lastRenderedPageBreak.follows_all_content:
            return None

        # -- `Paragraph` is only imported under `TYPE_CHECKING` at module level, so
        # -- the runtime import happens here (presumably to avoid a circular import).
        from docx.text.paragraph import Paragraph

        return Paragraph(self._lastRenderedPageBreak.following_fragment_p, self._parent)