# Source code for docx.text.pagebreak
"""Proxy objects related to rendered page-breaks."""
from __future__ import annotations
from typing import TYPE_CHECKING
from docx import types as t
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.shared import Parented
if TYPE_CHECKING:
from docx.text.paragraph import Paragraph
class RenderedPageBreak(Parented):
    """Proxy for a page-break Word inserted while laying out pages for print/display.

    Such a break is normally not a "hard" page-break placed by the document author;
    it only records that Word ran out of room on one page and had to start the next.
    Its position can vary with the printer, page-size, margins, etc., and it also
    shifts in response to edits, though not until Word loads and saves the document.

    `python-docx` never inserts these because it has no rendering function. They are
    generally only useful when extracting text from existing documents, i.e. when
    `python-docx` is being used solely as a document "reader".

    NOTE: a rendered page-break can fall inside a hyperlink. Consider a multi-word
    hyperlink like "excellent Wikipedia article on LLMs" that lands near the end of
    the last line on a page, such that the page breaks between "Wikipedia" and
    "article". In such a "page-breaks-in-hyperlink" case, THESE METHODS "MOVE" THE
    PAGE-BREAK to occur after the hyperlink, so the entire hyperlink appears in the
    paragraph returned by `.preceding_paragraph_fragment`. This places the "tail"
    text of the hyperlink on the "wrong" page, but avoids producing two hyperlinks
    that each carry a fragment of the actual text and point to the same address.
    """

    def __init__(
        self,
        lastRenderedPageBreak: CT_LastRenderedPageBreak,
        parent: t.ProvidesStoryPart,
    ):
        super().__init__(parent)
        # -- the same element is stored under both names: the generic `_element`
        # -- and the element-specific `_lastRenderedPageBreak` used by this class
        self._element = self._lastRenderedPageBreak = lastRenderedPageBreak

    @property
    def preceding_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph holding the content that comes before this page-break.

        Intended to be used together with `.following_paragraph_fragment`.

        `None` is returned when no content precedes this page-break. That case is
        common; it occurs whenever a page breaks on an even paragraph boundary.
        Returning `None` then avoids "inserting" a non-existent paragraph into the
        content stream. Note that content can include DrawingML items like images or
        charts.

        The returned paragraph *is divorced from the document body*; changes made to
        it are not reflected in the document. It serves as a familiar container
        (`Paragraph`) to interrogate for the content preceding this page-break in
        the paragraph in which it occurred.

        When this break occurs within a hyperlink, the fragment contains the entire
        hyperlink.
        """
        page_break_elm = self._lastRenderedPageBreak
        if page_break_elm.precedes_all_content:
            return None

        # -- `Paragraph` is only imported for type-checking at module level, so it
        # -- is imported at runtime here, and only once a fragment is needed
        from docx.text.paragraph import Paragraph

        fragment_p = page_break_elm.preceding_fragment_p
        return Paragraph(fragment_p, self._parent)

    @property
    def following_paragraph_fragment(self) -> Paragraph | None:
        """A "loose" paragraph holding the content that comes after this page-break.

        HAS POTENTIALLY SURPRISING BEHAVIORS, so read carefully to be sure this is
        what you want. It is primarily aimed at text-extraction use-cases where
        precisely associating text with the page it occurs on is important.

        Intended to be used together with `.preceding_paragraph_fragment`.

        `None` is returned when no content follows this page-break. That case is
        unlikely in practice because Word places even-paragraph-boundary page-breaks
        on the paragraph *following* the page-break. Still, it is possible and must
        be checked for. Returning `None` then avoids "inserting" an extra,
        non-existent paragraph into the content stream. Note that content can
        include DrawingML items like images or charts, not just text.

        The returned paragraph *is divorced from the document body*; changes made to
        it are not reflected in the document. It serves as a container (`Paragraph`)
        with familiar properties and methods that can be used to characterize the
        paragraph content following a mid-paragraph page-break.

        When this break occurs within a hyperlink, the fragment contains no portion
        of that hyperlink.
        """
        page_break_elm = self._lastRenderedPageBreak
        if page_break_elm.follows_all_content:
            return None

        # -- `Paragraph` is only imported for type-checking at module level, so it
        # -- is imported at runtime here, and only once a fragment is needed
        from docx.text.paragraph import Paragraph

        fragment_p = page_break_elm.following_fragment_p
        return Paragraph(fragment_p, self._parent)