Zum Inhalt

md_generator

academic_doc_generator.review.md_generator

create_review_markdown(rewritten, output_file)

Create a Markdown review document from rewritten comments.

Parameters:

Name Type Description Default
rewritten dict[int, list[dict]]

Dictionary of rewritten comments with page/line info.

required
output_file str

Path to the markdown file.

required
Source code in src/academic_doc_generator/review/md_generator.py
def create_review_markdown(rewritten: dict[int, list[dict]], output_file: str):
    """Write the rewritten review comments to a Markdown file.

    The document starts with a fixed heading and salutation, followed by one
    bullet per comment in the iteration order of ``rewritten``. A confirmation
    message is printed once the file has been written.

    Args:
        rewritten: Mapping of page number to the comment dicts for that page.
            Each dict must provide the keys ``"line"`` and ``"rewritten"``.
        output_file: Path of the Markdown file to create or overwrite.
    """
    header = (
        "# Peer Review",
        "",
        "Dear authors,",
        "",
        "here are my comments on your manuscript:",
        "",
    )
    bullets = [
        f"- Page {page_no}, Line {entry['line']}: {entry['rewritten']}"
        for page_no, entries in rewritten.items()
        for entry in entries
    ]
    # The whole document is assembled before the file is opened, so an entry
    # with a missing key raises before the output file is touched.
    document = "\n".join([*header, *bullets])
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(document)
    print(f"Markdown review saved to {output_file}")

estimate_line_number(y_coord, page_height, line_height=12.0)

Estimate the line number of a comment based on its y-coordinate.

Parameters:

Name Type Description Default
y_coord float

Y-coordinate of the annotation rectangle (PDF origin is bottom-left).

required
page_height float

Total height of the PDF page.

required
line_height float

Approximate line spacing in points (default 12pt).

12.0

Returns:

Type Description
int

Estimated line number (1-based).

Source code in src/academic_doc_generator/review/md_generator.py
def estimate_line_number(y_coord: float, page_height: float, line_height: float = 12.0) -> int:
    """Approximate which text line a y-coordinate falls on.

    The page is treated as a stack of equally tall lines counted from the top
    edge; the result is never smaller than 1.

    Args:
        y_coord: Y-coordinate of the annotation rectangle, measured from the
            bottom of the page (PDF origin is bottom-left).
        page_height: Total height of the PDF page.
        line_height: Approximate line spacing in points (default 12pt).

    Returns:
        Estimated line number (1-based).
    """
    # Flip the axis: the input is measured from the bottom edge, lines are
    # counted from the top edge.
    offset_from_top = page_height - y_coord
    full_lines_above = int(offset_from_top / line_height)
    estimate = full_lines_above + 1
    # Coordinates above the top edge would give values below 1; clamp them.
    return estimate if estimate > 1 else 1

find_annotation_context_with_lines(pages_words, annotations, page_heights)

Like find_annotation_context, but also attach estimated line numbers.

Parameters:

Name Type Description Default
pages_words dict

Dictionary of words with positions per page.

required
annotations dict

Extracted annotations per page.

required
page_heights dict

Mapping page index -> page height in points.

required

Returns:

Type Description
dict[int, list[dict]]

Dict mapping 1-based page numbers (input page index + 1) to the list of annotations with line info; pages without usable annotations are omitted.

Source code in src/academic_doc_generator/review/md_generator.py
def find_annotation_context_with_lines(
    pages_words: dict, annotations: dict, page_heights: dict
) -> dict[int, list[dict]]:
    """Build per-page annotation records that carry a line number.

    For every annotation with a non-empty ``"rect"``, a printed line number is
    looked up via ``find_line_number_from_text``; when that returns -1 the
    line is estimated geometrically with ``estimate_line_number``. The
    ``"highlighted"`` and ``"paragraph"`` fields of each record are left as
    empty strings by this function.

    Args:
        pages_words: Dictionary of words with positions per page, keyed like
            ``annotations``.
        annotations: Extracted annotations per page. Each annotation must have
            the keys ``"rect"`` and ``"comment"``; ``"category"`` is optional
            and defaults to ``"llm"``.
        page_heights: Mapping page index -> page height in points.

    Returns:
        Dict mapping page numbers (input key + 1) to the list of annotation
        records for that page. Pages that yield no records are omitted.
    """
    results: dict[int, list[dict]] = {}
    for page_idx, page_annots in annotations.items():
        entries = []
        for annotation in page_annots:
            bbox = annotation["rect"]
            if bbox:
                # Only y1 is needed, but the unpacking also enforces that the
                # rect consists of exactly four values.
                _, _, _, rect_y1 = bbox

                line_no = find_line_number_from_text(pages_words.get(page_idx, []), bbox)
                if line_no == -1:
                    # No printed number found: estimate from geometry, passing
                    # y1 as the rect's top edge (assumes bottom-left PDF
                    # origin -- TODO confirm against the annotation extractor).
                    line_no = estimate_line_number(rect_y1, page_heights[page_idx])

                record = {
                    "comment": annotation["comment"],
                    "highlighted": "",
                    "paragraph": "",
                    "category": annotation.get("category", "llm"),
                    "line": line_no,
                }
                entries.append(record)
        if entries:
            results[page_idx + 1] = entries
    return results

find_line_number_from_text(words, annot_bbox, x_threshold=20.0)

Try to find a printed line number near the annotation by scanning words at the left margin of the page.

Parameters:

Name Type Description Default
words list

List of word dicts with "text" and "bbox".

required
annot_bbox tuple

(x0, y0, x1, y1) of the annotation.

required
x_threshold float

Max x-position to still be considered a margin line number.

20.0

Returns:

Name Type Description
int int

Detected line number, or -1 if none found.

Source code in src/academic_doc_generator/review/md_generator.py
def find_line_number_from_text(words: list, annot_bbox: tuple, x_threshold: float = 20.0) -> int:
    """Try to find a printed line number near the annotation by scanning
    words at the left margin of the page.

    The first word (in list order) that starts left of ``x_threshold``,
    vertically overlaps the annotation and consists only of decimal digits is
    taken as the line number.

    Args:
        words: List of word dicts with "text" and "bbox" (x0, y0, x1, y1).
            The bboxes are compared directly with ``annot_bbox``, so both are
            assumed to use the same coordinate system -- TODO confirm.
        annot_bbox: (x0, y0, x1, y1) of the annotation.
        x_threshold: Max x-position to still be considered a margin line number.

    Returns:
        int: Detected line number, or -1 if none found.
    """
    _, annot_y0, _, annot_y1 = annot_bbox

    for word in words:
        word_x0, word_y0, _, word_y1 = word["bbox"]
        # str.isdecimal() is used instead of str.isdigit(): isdigit() also
        # accepts characters such as superscript digits (e.g. a footnote
        # marker), which int() cannot parse and would raise ValueError.
        if (
            word_x0 < x_threshold
            and word_y0 <= annot_y1
            and word_y1 >= annot_y0
            and word["text"].isdecimal()
        ):
            return int(word["text"])

    return -1

rewrite_comments_markdown(context_dict, llm_client, groq_free=False, verbose=False)

Rewrite comments for peer review (Markdown output).

Parameters:

Name Type Description Default
context_dict dict[int, list[dict]]

Mapping page numbers to annotation dicts including "line".

required
llm_client LLMClient

LLMClient instance for API access.

required
groq_free bool

Whether to apply throttling for free-tier.

False
verbose bool

Print debugging info.

False

Returns:

Type Description
dict[int, list[dict]]

Dict with rewritten comments per page.

Source code in src/academic_doc_generator/review/md_generator.py
def rewrite_comments_markdown(
    context_dict: dict[int, list[dict]],
    llm_client: LLMClient,
    groq_free: bool = False,
    verbose: bool = False,
) -> dict[int, list[dict]]:
    """Rewrite comments for peer review (Markdown output).

    Every annotation whose category is ``"llm"`` is sent to the LLM with the
    ``REWRITE_COMMENT_MARKDOWN`` prompt; annotations of any other category are
    skipped. Pages that end up without rewritten comments are left out of the
    result.

    Args:
        context_dict: Mapping page numbers to annotation dicts including
            "comment", "category" and "line" ("paragraph" and "highlighted"
            are optional and default to empty strings).
        llm_client: LLMClient instance for API access.
        groq_free: Whether to apply throttling for free-tier. When enabled,
            the function pauses for 10 seconds after every 5 LLM requests.
        verbose: Print debugging info.

    Returns:
        Dict with rewritten comments per page. Each entry holds the keys
        "original", "rewritten", "line" and "page".
    """
    rewritten: dict[int, list[dict]] = {}
    # Number of chat-completion calls made so far. Throttling is driven by
    # this counter rather than by the number of pages stored in the result,
    # so pages with many comments are rate-limited and pages without any
    # LLM comments never cause a pause.
    requests_sent = 0

    for page_num, items in context_dict.items():
        rewritten_items = []

        for item in items:
            if item["category"] != "llm":
                continue  # skip non-LLM comments (e.g. "Quelle"/source notes)

            if groq_free and requests_sent and requests_sent % 5 == 0:
                print("Waiting 10s for free-tier rate limit")
                time.sleep(10)

            comment = item["comment"]
            paragraph = item.get("paragraph", "")
            highlighted = item.get("highlighted", "")

            prompt = build_prompt(
                PromptTemplate.REWRITE_COMMENT_MARKDOWN,
                paragraph=paragraph,
                highlighted=highlighted,
                comment=comment,
            )

            messages = [{"role": "user", "content": prompt}]
            rewritten_raw = llm_client.chat_completion(messages)
            requests_sent += 1

            rewritten_items.append(
                {
                    "original": comment,
                    "rewritten": rewritten_raw,
                    "line": item["line"],
                    "page": page_num,
                }
            )

        if rewritten_items:
            rewritten[page_num] = rewritten_items

    if verbose:
        print(rewritten)

    return rewritten