md_generator¶

`academic_doc_generator.review.md_generator` ¶

`create_review_markdown(rewritten, output_file)` ¶

Create a Markdown review document from rewritten comments.

Parameters:

Name	Type	Description	Default
`rewritten`	`dict[int, list[dict]]`	Dictionary of rewritten comments with page/line info.	required
`output_file`	`str`	Path to the markdown file.	required

Source code in src/academic_doc_generator/review/md_generator.py

def create_review_markdown(rewritten: dict[int, list[dict]], output_file: str):
    """Write the rewritten review comments to a Markdown file.

    The document consists of a fixed greeting followed by one bullet per
    comment, in the iteration order of ``rewritten`` and of each page's list.

    Args:
        rewritten: Mapping of page number to a list of comment dicts. Each
            dict must provide the keys ``"line"`` and ``"rewritten"``.
        output_file: Path of the Markdown file to create or overwrite.
    """
    header = [
        "# Peer Review",
        "",
        "Dear authors,",
        "",
        "here are my comments on your manuscript:",
        "",
    ]
    # All bullets are built before the file is opened, so a missing key
    # raises without leaving a partially written file behind.
    bullets = [
        f"- Page {page}, Line {entry['line']}: {entry['rewritten']}"
        for page, entries in rewritten.items()
        for entry in entries
    ]
    document = "\n".join(header + bullets)

    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(document)

    print(f"Markdown review saved to {output_file}")

`estimate_line_number(y_coord, page_height, line_height=12.0)` ¶

Estimate the line number of a comment based on its y-coordinate.

Parameters:

Name	Type	Description	Default
`y_coord`	`float`	Y-coordinate of the annotation rectangle (PDF origin is bottom-left).	required
`page_height`	`float`	Total height of the PDF page.	required
`line_height`	`float`	Approximate line spacing in points (default 12pt).	`12.0`

Returns:

Type	Description
`int`	Estimated line number (1-based).

Source code in src/academic_doc_generator/review/md_generator.py

def estimate_line_number(y_coord: float, page_height: float, line_height: float = 12.0) -> int:
    """Convert a vertical PDF coordinate into an approximate text line number.

    Args:
        y_coord: Y-coordinate of the annotation rectangle, measured from the
            bottom of the page (PDF origin is bottom-left).
        page_height: Total height of the PDF page.
        line_height: Approximate line spacing in points (default 12pt).

    Returns:
        Estimated line number, counted from the top of the page and never
        smaller than 1.
    """
    # The coordinate is measured from the bottom edge; flip it so that the
    # offset grows downwards from the top of the page.
    offset_from_top = page_height - y_coord

    # Number of whole lines that fit above the coordinate (truncated).
    lines_above = int(offset_from_top / line_height)

    line_number = lines_above + 1
    if line_number < 1:
        # Coordinates above the page top would give line 0 or below.
        return 1
    return line_number

`find_annotation_context_with_lines(pages_words, annotations, page_heights)` ¶

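Like `find_annotation_context`, but also attaches a line number to each annotation: the printed margin line number when one is found next to the annotation, otherwise a geometric estimate.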

Parameters:

Name	Type	Description	Default
`pages_words`	`dict`	Dictionary of words with positions per page.	required
`annotations`	`dict`	Extracted annotations per page.	required
`page_heights`	`dict`	Mapping page index -> page height in points.	required

Returns:

Type	Description
`dict[int, list[dict]]`	Dict mapping page numbers to list of annotations with line info.

Source code in src/academic_doc_generator/review/md_generator.py

def find_annotation_context_with_lines(
    pages_words: dict, annotations: dict, page_heights: dict
) -> dict[int, list[dict]]:
    """Build per-page annotation records that carry a line number.

    For every annotation with a non-empty rectangle, a printed margin line
    number is looked up first; if none is found, the line is estimated from
    the rectangle's vertical position on the page.

    Args:
        pages_words: Dictionary of words with positions per page.
        annotations: Extracted annotations per page. Each annotation needs the
            keys ``"rect"`` and ``"comment"``; ``"category"`` is optional.
        page_heights: Mapping page index -> page height in points.

    Returns:
        Dict mapping page numbers (input page key plus one) to the list of
        annotation records for that page. Pages without usable annotations
        are left out.
    """
    results: dict[int, list[dict]] = {}

    for page_index, page_annotations in annotations.items():
        entries = []

        for annotation in page_annotations:
            bbox = annotation["rect"]
            if bbox:
                # Only the fourth coordinate is needed here; it is used as the
                # top edge of the rectangle for the fallback estimate.
                _, _, _, top_edge = bbox

                line_no = find_line_number_from_text(pages_words.get(page_index, []), bbox)
                if line_no == -1:
                    # No printed line number nearby: estimate geometrically.
                    line_no = estimate_line_number(top_edge, page_heights[page_index])

                entries.append(
                    {
                        "comment": annotation["comment"],
                        "highlighted": "",
                        "paragraph": "",
                        "category": annotation.get("category", "llm"),
                        "line": line_no,
                    }
                )

        if entries:
            # Output keys are shifted by one relative to the input page keys.
            results[page_index + 1] = entries

    return results

`find_line_number_from_text(words, annot_bbox, x_threshold=20.0)` ¶

Try to find a printed line number near the annotation by scanning words at the left margin of the page.

Parameters:

Name	Type	Description	Default
`words`	`list`	List of word dicts with "text" and "bbox".	required
`annot_bbox`	`tuple`	(x0, y0, x1, y1) of the annotation.	required
`x_threshold`	`float`	Max x-position to still be considered a margin line number.	`20.0`

Returns:

Name	Type	Description
`int`	`int`	Detected line number, or -1 if none found.

Source code in src/academic_doc_generator/review/md_generator.py

def find_line_number_from_text(words: list, annot_bbox: tuple, x_threshold: float = 20.0) -> int:
    """Try to find a printed line number near the annotation by scanning
    words at the left margin of the page.

    The first word (in list order) that starts left of ``x_threshold``,
    vertically overlaps the annotation, and consists only of decimal digits
    is taken as the line number.

    Args:
        words: List of word dicts with "text" and "bbox", where "bbox" is
            (x0, y0, x1, y1).
        annot_bbox: (x0, y0, x1, y1) of the annotation.
        x_threshold: Max x-position to still be considered a margin line number.

    Returns:
        Detected line number, or -1 if none found.
    """
    _, annot_y0, _, annot_y1 = annot_bbox

    for word in words:
        word_x0, word_y0, _, word_y1 = word["bbox"]

        in_margin = word_x0 < x_threshold
        overlaps_vertically = word_y0 <= annot_y1 and word_y1 >= annot_y0
        if not (in_margin and overlaps_vertically):
            continue

        text = word["text"]
        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as superscript digits, which int() cannot parse and
        # which would raise ValueError here. Such tokens are skipped so that
        # the scan can still reach a real line number.
        if text.isdecimal():
            return int(text)

    return -1

`rewrite_comments_markdown(context_dict, llm_client, groq_free=False, verbose=False)` ¶

Rewrite comments for peer review (Markdown output).

Parameters:

Name	Type	Description	Default
`context_dict`	`dict[int, list[dict]]`	Mapping page numbers to annotation dicts including "line".	required
`llm_client`	`LLMClient`	LLMClient instance for API access.	required
`groq_free`	`bool`	Whether to apply throttling for free-tier.	`False`
`verbose`	`bool`	Print debugging info.	`False`

Returns:

Type	Description
`dict[int, list[dict]]`	Dict with rewritten comments per page.

Source code in src/academic_doc_generator/review/md_generator.py

def rewrite_comments_markdown(
    context_dict: dict[int, list[dict]],
    llm_client: LLMClient,
    groq_free: bool = False,
    verbose: bool = False,
) -> dict[int, list[dict]]:
    """Send each LLM-category comment to the LLM and collect the rewrites.

    Args:
        context_dict: Mapping page numbers to annotation dicts including "line".
        llm_client: LLMClient instance for API access.
        groq_free: Whether to apply throttling for free-tier.
        verbose: Print debugging info.

    Returns:
        Dict with rewritten comments per page. Each entry holds the keys
        "original", "rewritten", "line" and "page". Pages that yield no
        rewritten comment are omitted.
    """
    results: dict[int, list[dict]] = {}

    for page_num, items in context_dict.items():
        # Free-tier throttle: pause before processing a page whenever the
        # number of pages stored so far, plus one, is a multiple of five.
        pause_needed = groq_free and (len(results) + 1) % 5 == 0
        if pause_needed:
            print("Waiting 10s for free-tier rate limit")
            time.sleep(10)

        page_entries = []
        # Only comments categorised as "llm" are rewritten; the generator
        # keeps the category lookup interleaved with the API calls.
        llm_items = (entry for entry in items if entry["category"] == "llm")

        for entry in llm_items:
            original_text = entry["comment"]

            prompt = build_prompt(
                PromptTemplate.REWRITE_COMMENT_MARKDOWN,
                paragraph=entry.get("paragraph", ""),
                highlighted=entry.get("highlighted", ""),
                comment=original_text,
            )
            response = llm_client.chat_completion([{"role": "user", "content": prompt}])

            page_entries.append(
                {
                    "original": original_text,
                    "rewritten": response,
                    "line": entry["line"],
                    "page": page_num,
                }
            )

        if page_entries:
            results[page_num] = page_entries

    if verbose:
        print(results)

    return results

md_generator¶

academic_doc_generator.review.md_generator ¶

create_review_markdown(rewritten, output_file) ¶

estimate_line_number(y_coord, page_height, line_height=12.0) ¶

find_annotation_context_with_lines(pages_words, annotations, page_heights) ¶

find_line_number_from_text(words, annot_bbox, x_threshold=20.0) ¶

rewrite_comments_markdown(context_dict, llm_client, groq_free=False, verbose=False) ¶

`academic_doc_generator.review.md_generator` ¶

`create_review_markdown(rewritten, output_file)` ¶

`estimate_line_number(y_coord, page_height, line_height=12.0)` ¶

`find_annotation_context_with_lines(pages_words, annotations, page_heights)` ¶

`find_line_number_from_text(words, annot_bbox, x_threshold=20.0)` ¶

`rewrite_comments_markdown(context_dict, llm_client, groq_free=False, verbose=False)` ¶