Skip to content

llm

academic_doc_generator.core.llm

LLM interface with comprehensive type annotations for API interactions.

detect_degree_from_filename(pdf_path, llm_client)

Detect if thesis is Bachelor or Master from PDF filename.

Parameters:

Name Type Description Default
pdf_path str

Path to the PDF file.

required
llm_client LLMClientProtocol

LLMClient instance for API access.

required

Returns:

Name Type Description
str str | None

"Bachelor" or "Master", or None if unable to determine.

Source code in src/academic_doc_generator/core/llm.py
def detect_degree_from_filename(pdf_path: str, llm_client: LLMClientProtocol) -> Optional[str]:
    """Detect if thesis is Bachelor or Master from the PDF filename.

    Args:
        pdf_path: Path to the PDF file.
        llm_client: LLMClient instance for API access.

    Returns:
        "Bachelor" or "Master", or None if unable to determine.
        (Fixed annotation: the previous ``-> str`` hid the None case.)
    """
    import os

    filename = os.path.basename(pdf_path)

    prompt = build_prompt(PromptTemplate.DETECT_DEGREE, filename=filename)

    messages = [{"role": "user", "content": prompt}]
    # Lowercase once so both checks below share the normalized form.
    response = llm_client.chat_completion(messages).strip().lower()

    # Normalize the free-form LLM response to a canonical value.
    # "bachelor" is checked first; if both terms appear it wins (as before).
    if "bachelor" in response:
        return "Bachelor"
    if "master" in response:
        return "Master"
    return None

detect_language(results, llm_client, groq_free, sample_size=3)

Detect the language (German or English) of the comments.

Parameters:

Name Type Description Default
results dict[int, list[RewrittenComment]]

Dictionary containing rewritten comments per page.

required
llm_client LLMClientProtocol

LLM client instance for API access.

required
groq_free bool

Whether to apply request throttling (2 second delay).

required
sample_size int

Number of sample comments to analyze for language detection. Defaults to 3.

3

Returns:

Type Description
str

"German" if German language detected, "English" if English.

Example

>>> comments = {1: [{'rewritten': 'Warum wurde das gewählt?'}]}
>>> client = LLMClient()
>>> lang = detect_language(comments, client, groq_free=False)
>>> lang
'German'

Source code in src/academic_doc_generator/core/llm.py
def detect_language(
    results: dict[int, list[RewrittenComment]],
    llm_client: LLMClientProtocol,
    groq_free: bool,
    sample_size: int = 3,
) -> str:
    """Detect the language (German or English) of the comments.

    Args:
        results: Dictionary containing rewritten comments per page.
        llm_client: LLM client instance for API access.
        groq_free: Whether to apply request throttling (2 second delay).
        sample_size: Number of sample comments to analyze for language detection.
            Defaults to 3.

    Returns:
        "German" if German language detected, "English" if English.

    Example:
        >>> comments = {1: [{'rewritten': 'Warum wurde das gewählt?'}]}
        >>> client = LLMClient()
        >>> lang = detect_language(comments, client, groq_free=False)
        >>> lang
        'German'
    """
    # Collect up to `sample_size` rewritten comments for language detection.
    texts: list[str] = []
    for items in results.values():  # page numbers are irrelevant here
        for item in items:
            texts.append(item["rewritten"])
            if len(texts) >= sample_size:
                break
        if len(texts) >= sample_size:
            break

    sample_text = "\n".join(texts)

    prompt = build_prompt(PromptTemplate.DETECT_LANGUAGE, text=sample_text)

    messages = [{"role": "user", "content": prompt}]
    lang = llm_client.chat_completion(messages)

    if groq_free:  # wait 2 seconds to stay under the free-tier rate limit
        time.sleep(2)

    # Bug fix: strip surrounding whitespace so downstream equality checks
    # such as `language == "German"` do not fail on a trailing newline
    # (consistent with detect_degree_from_filename, which also strips).
    return lang.strip()

determine_gender_from_name(first_name, llm_client)

Determine the formal German address (Herr/Frau) from a first name using LLM.

Parameters:

Name Type Description Default
first_name str

First/given name of the person.

required
llm_client LLMClientProtocol

LLMClient instance for API access.

required

Returns:

Name Type Description
str str

Either "Herr" or "Frau" based on the name.

Source code in src/academic_doc_generator/core/llm.py
def determine_gender_from_name(first_name: str, llm_client: LLMClientProtocol) -> str:
    """Determine the formal German address (Herr/Frau) from a first name using LLM.

    Args:
        first_name: First/given name of the person.
        llm_client: LLMClient instance for API access.

    Returns:
        str: Either "Herr" or "Frau" based on the name, or the neutral
        "Herr/Frau" if the model output is not one of the expected values.
    """
    prompt = build_prompt(PromptTemplate.DETERMINE_GENDER, first_name=first_name)

    messages = [{"role": "user", "content": prompt}]
    # Bug fix: strip the response — a trailing newline previously made the
    # whitelist check below fail and fall back to "Herr/Frau" even when the
    # model answered correctly (consistent with detect_degree_from_filename).
    result = llm_client.chat_completion(messages).strip()

    # Guard against free-form LLM output: only accept the expected values.
    if result not in {"Herr", "Frau", "Herr/Frau"}:
        return "Herr/Frau"

    return result

extract_document_metadata(pages_text, language, llm_client, pdf_path=None)

Extract author, matriculation number, title, and examiners from the first two pages.

Parameters:

Name Type Description Default
pages_text dict[int, str]

Dictionary mapping page indices to text content.

required
language str

Language the thesis is written in ("German" or "English").

required
llm_client LLMClientProtocol

LLM client instance for API access.

required
pdf_path str | None

Path to PDF file for fallback degree detection from filename.

None

Returns:

Type Description
ThesisMetadata

Dictionary with extracted metadata. If any field cannot be extracted,

ThesisMetadata

it will contain None as the value.

Example

>>> text = {0: "Bachelor Thesis by Max Mustermann (123456)"}
>>> client = LLMClient()
>>> metadata = extract_document_metadata(text, "German", client)
>>> metadata['author']
'Max Mustermann'
>>> metadata['id_number']
'123456'

Source code in src/academic_doc_generator/core/llm.py
def extract_document_metadata(
    pages_text: dict[int, str],
    language: str,
    llm_client: LLMClientProtocol,
    pdf_path: Optional[str] = None,
) -> ThesisMetadata:
    """Extract author, matriculation number, title, and examiners from the first two pages.

    Args:
        pages_text: Dictionary mapping page indices to text content.
        language: Language the thesis is written in ("German" or "English").
        llm_client: LLM client instance for API access.
        pdf_path: Path to PDF file for fallback degree detection from the
            filename. Defaults to None (no fallback).

    Returns:
        Dictionary with extracted metadata. If any field cannot be extracted,
        it will contain None as the value. On unparseable LLM output an
        empty dict is returned.

    Example:
        >>> text = {0: "Bachelor Thesis by Max Mustermann (123456)"}
        >>> client = LLMClient()
        >>> metadata = extract_document_metadata(text, "German", client)
        >>> metadata['author']
        'Max Mustermann'
        >>> metadata['id_number']
        '123456'
    """
    # Collect first two pages of text (if available)
    sample_text = "\n\n".join([pages_text.get(i, "") for i in sorted(pages_text.keys())[:2]])

    prompt = build_prompt(PromptTemplate.EXTRACT_METADATA, language=language, text=sample_text)

    messages = [{"role": "user", "content": prompt}]
    content = llm_client.chat_completion(messages)

    try:
        raw_metadata: dict[str, Any] = json.loads(content)
    except json.JSONDecodeError:
        # ThesisMetadata is TypedDict(total=False), so {} is a valid empty result.
        return {}

    # Rename sid to id_number if it exists in the raw LLM response
    if "sid" in raw_metadata and "id_number" not in raw_metadata:
        raw_metadata["id_number"] = raw_metadata.pop("sid")

    metadata: ThesisMetadata = raw_metadata  # type: ignore[assignment]

    # Fallback: if bachelor_master could not be determined from the document,
    # try the filename. `not metadata.get(...)` already covers the None case,
    # so the previous redundant `... is None` check was dropped.
    if pdf_path and not metadata.get("bachelor_master"):
        print("   ⚠️  Bachelor/Master konnte nicht aus Dokument bestimmt werden")
        print("   🔄 Versuche Bestimmung über Dateinamen...")
        degree_from_filename = detect_degree_from_filename(pdf_path, llm_client)
        if degree_from_filename:
            metadata["bachelor_master"] = degree_from_filename  # type: ignore[typeddict-item]
            print(f"   ✅ Aus Dateinamen bestimmt: {degree_from_filename}")
        else:
            print("   ❌ Konnte Bachelor/Master auch nicht aus Dateinamen bestimmen")

    return metadata

get_summary_and_metadata_of_pdf(pdf_path, language, llm_client=None, groq_free=False, verbose=False)

Extract thesis metadata and generate a summary from the PDF.

This function uses the first pages of the PDF to detect metadata such as author, matriculation number, thesis title, and examiners, and generates a LaTeX-formatted summary of the thesis content using an LLM.

Parameters:

Name Type Description Default
pdf_path str

Path to the thesis PDF.

required
language str

Language the thesis is written in ("German" or "English").

required
llm_client Optional[LLMClientProtocol]

LLM client instance. If None, creates a new one with automatic API selection.

None
groq_free bool

Whether to apply request throttling to stay under free-tier rate limits. Adds 20s delay after metadata extraction and 2s delay after summarization. Defaults to False.

False
verbose bool

If True, prints the generated summary. Defaults to False.

False

Returns:

Type Description
str

Tuple of (summary, metadata):

ThesisMetadata
  • summary: LaTeX-formatted summary of the thesis
tuple[str, ThesisMetadata]
  • metadata: Extracted thesis metadata including author, title, examiners
Example

from llm_client import LLMClient client = LLMClient() summary, metadata = get_summary_and_metadata_of_pdf( ... "thesis.pdf", "German", client ... ) metadata['bachelor_master'] 'Bachelor' "untersucht" in summary True

Source code in src/academic_doc_generator/core/llm.py
def get_summary_and_metadata_of_pdf(
    pdf_path: str,
    language: str,
    llm_client: Optional[LLMClientProtocol] = None,
    groq_free: bool = False,
    verbose: bool = False,
) -> tuple[str, ThesisMetadata]:
    """Extract thesis metadata and generate a summary from the PDF.

    Reads the PDF's text page by page, extracts metadata (author,
    matriculation number, thesis title, examiners) from the opening pages,
    and asks an LLM for a LaTeX-formatted summary of the thesis content.

    Args:
        pdf_path: Path to the thesis PDF.
        language: Language the thesis is written in ("German" or "English").
        llm_client: LLM client instance. A new one with automatic API
            selection is created when None.
        groq_free: Apply request throttling for free-tier rate limits:
            a 20s pause after metadata extraction and 2s after
            summarization. Defaults to False.
        verbose: Print the generated summary when True. Defaults to False.

    Returns:
        Tuple of (summary, metadata):
        - summary: LaTeX-formatted summary of the thesis
        - metadata: Extracted thesis metadata including author, title, examiners

    Example:
        >>> from llm_client import LLMClient
        >>> client = LLMClient()
        >>> summary, metadata = get_summary_and_metadata_of_pdf(
        ...     "thesis.pdf", "German", client
        ... )
        >>> metadata['bachelor_master']
        'Bachelor'
        >>> "untersucht" in summary
        True
    """
    if llm_client is None:
        llm_client = LLMClient()
        print(f"Using LLM API: {llm_client.api_choice} with model: {llm_client.llm}")

    print("Starting to get summary and metadata of the thesis.")

    # Plain text per page, used for both metadata detection and summarization.
    page_texts = pdf.extract_text_per_page(pdf_path)

    # Metadata first; pdf_path enables the filename-based degree fallback.
    metadata = extract_document_metadata(page_texts, language, llm_client, pdf_path=pdf_path)

    if groq_free:
        print("Waiting for 20 seconds to avoid error: Too Many Requests")
        time.sleep(20)

    summary = summarize_thesis(page_texts, language, llm_client)

    if verbose:
        print("Summary of thesis:\n", summary)

    if groq_free:
        time.sleep(2)

    return summary, metadata

rewrite_comments(context_dict, llm_client, groq_free=False, verbose=False)

Rewrite rough comments into clear, polite questions using LLMClient.

Only comments categorized as "llm" are rewritten. Comments with category "quelle" or "language" are skipped but retained in the results for later analysis. Comments with category "ignore" are excluded entirely.

Parameters:

Name Type Description Default
context_dict dict[int, list[AnnotationContext]]

Mapping of page numbers to annotation contexts, where each annotation dict contains comment, highlighted text, paragraph, and category.

required
llm_client LLMClientProtocol

LLM client instance implementing the LLMClientProtocol.

required
groq_free bool

Whether to apply request throttling to stay under Groq's free-tier rate limits (4s per request, 10s every 5 requests). Defaults to False.

False
verbose bool

If True, prints debug information about responses. Defaults to False.

False

Returns:

Type Description
dict[int, list[RewrittenComment]]

Dictionary mapping page numbers to rewritten comments. Skipped comments

dict[int, list[RewrittenComment]]

(quelle/language) are excluded from the output.

Example

context = {1: [{'comment': 'Why?', 'highlighted': 'text', ... 'paragraph': 'context', 'category': 'llm'}]} client = LLMClient() result = rewrite_comments(context, client) result[1][0]['rewritten'] 'Could you explain the reasoning behind this approach?'

Source code in src/academic_doc_generator/core/llm.py
def rewrite_comments(
    context_dict: dict[int, list[pdf.AnnotationContext]],
    llm_client: LLMClientProtocol,
    groq_free: bool = False,
    verbose: bool = False,
) -> dict[int, list[RewrittenComment]]:
    """Rewrite rough comments into clear, polite questions using LLMClient.

    Only comments categorized as "llm" are rewritten. Comments with category
    "quelle" or "language" are skipped but retained in the results for later
    analysis. Comments with category "ignore" are excluded entirely.

    Args:
        context_dict: Mapping of page numbers to annotation contexts, where each
            annotation dict contains comment, highlighted text, paragraph, and category.
        llm_client: LLM client instance implementing the LLMClientProtocol.
        groq_free: Whether to apply request throttling to stay under Groq's
            free-tier rate limits (4s per request, extra 10s every 5 requests).
            Defaults to False.
        verbose: If True, prints debug information about responses. Defaults to False.

    Returns:
        Dictionary mapping page numbers to rewritten comments. Skipped comments
        (quelle/language) are excluded from the output.

    Example:
        >>> context = {1: [{'comment': 'Why?', 'highlighted': 'text',
        ...                 'paragraph': 'context', 'category': 'llm'}]}
        >>> client = LLMClient()
        >>> result = rewrite_comments(context, client)
        >>> result[1][0]['rewritten']
        'Could you explain the reasoning behind this approach?'
    """
    rewritten: dict[int, list[RewrittenComment]] = {}
    request_count = 0  # API requests issued so far, for accurate throttling

    for page_num, items in context_dict.items():
        rewritten_items: list[RewrittenComment] = []

        for item in items:
            category = item.get("category", "llm")

            # Skip ignored comments and non-LLM categories
            if category != "llm":
                continue

            if groq_free:
                # Always wait 4 seconds (rate limit of 30 requests per minute).
                # Bug fix: the extra 10s pause was previously keyed off the
                # number of *pages* already stored in `rewritten`, not the
                # number of requests as documented; count real requests instead.
                request_count += 1
                time.sleep(4)
                if request_count % 5 == 0:
                    print("Waiting for 10 seconds to avoid error from API: Too Many Requests")
                    time.sleep(10)

            comment = item["comment"]
            paragraph = item["paragraph"]
            highlighted = item["highlighted"]

            prompt = build_prompt(
                PromptTemplate.REWRITE_COMMENT,
                paragraph=paragraph,
                highlighted=highlighted,
                comment=comment,
            )

            messages = [{"role": "user", "content": prompt}]
            rewritten_raw = llm_client.chat_completion(messages)

            if verbose:
                print(f"Response: {rewritten_raw}")

            # Escape LaTeX special characters but keep intentional LaTeX markup.
            rewritten_text = latex.escape_for_latex(rewritten_raw, preserve_latex=True)

            rewritten_items.append(
                {
                    "original": comment,
                    "rewritten": rewritten_text,
                    "highlighted": highlighted,
                    "paragraph": paragraph,
                    "category": category,
                }
            )

        # Only record pages that produced at least one rewritten comment.
        if rewritten_items:
            rewritten[page_num] = rewritten_items

    return rewritten

rewrite_comments_in_pdf(pdf_path, llm_client=None, groq_free=False, verbose=False, pdf_processor=None)

Extract and rewrite PDF comments into clear, polite questions.

This function parses the given PDF, extracts annotations, finds their textual context, and uses an LLM to rewrite rough comments into more understandable, well-phrased questions or feedback.

Parameters:

Name Type Description Default
pdf_path str

Path to the PDF file containing comments/annotations.

required
llm_client Optional[LLMClientProtocol]

LLM client instance. If None, creates a new one with automatic API selection.

None
groq_free bool

Whether to apply request throttling to stay under free-tier rate limits. Defaults to False.

False
verbose bool

If True, prints detailed information about original and rewritten comments. Defaults to False.

False
pdf_processor Any

Optional PDF processor module for dependency injection in tests. Defaults to None (uses the standard module).

None

Returns:

Type Description
dict[int, list[RewrittenComment]]

Tuple of (rewritten_comments, stats):

CommentStats
  • rewritten_comments: Dictionary mapping page numbers (1-based) to lists of rewritten comment dicts
tuple[dict[int, list[RewrittenComment]], CommentStats]
  • stats: Statistics about comment categories (quelle, language, ignore counts)
Example

from llm_client import LLMClient client = LLMClient() rewritten, stats = rewrite_comments_in_pdf("thesis.pdf", client) stats {'quelle': 3, 'language': 2, 'ignore': 0} rewritten[1][0]['category'] 'llm'

Source code in src/academic_doc_generator/core/llm.py
def rewrite_comments_in_pdf(
    pdf_path: str,
    llm_client: Optional[LLMClientProtocol] = None,
    groq_free: bool = False,
    verbose: bool = False,
    pdf_processor: Any = None,  # For dependency injection in tests
) -> tuple[dict[int, list[RewrittenComment]], pdf.CommentStats]:
    """Extract and rewrite PDF comments into clear, polite questions.

    Parses the given PDF, extracts its annotations together with their
    textual context, and asks an LLM to rephrase each rough comment as a
    well-formed, polite question or piece of feedback.

    Args:
        pdf_path: Path to the PDF file containing comments/annotations.
        llm_client: LLM client instance. A new one with automatic API
            selection is created when None.
        groq_free: Whether to throttle requests to stay under free-tier
            rate limits. Defaults to False.
        verbose: If True, prints each original and rewritten comment.
            Defaults to False.
        pdf_processor: Optional PDF processor module for dependency injection
            in tests. Defaults to None (uses the standard module).

    Returns:
        Tuple of (rewritten_comments, stats):
        - rewritten_comments: Dictionary mapping page numbers (1-based) to lists
          of rewritten comment dicts
        - stats: Statistics about comment categories (quelle, language, ignore counts)

    Example:
        >>> from llm_client import LLMClient
        >>> client = LLMClient()
        >>> rewritten, stats = rewrite_comments_in_pdf("thesis.pdf", client)
        >>> stats
        {'quelle': 3, 'language': 2, 'ignore': 0}
        >>> rewritten[1][0]['category']
        'llm'
    """
    if llm_client is None:
        llm_client = LLMClient()
        print(f"Using LLM API: {llm_client.api_choice} with model: {llm_client.llm}")

    # Resolve the PDF-processing backend: the real module by default,
    # or an injected stand-in during tests.
    if pdf_processor is None:
        from . import pdf as pdf_proc

        pdf_processor = pdf_proc

    print(f"Starting to rewrite comments in the thesis {pdf_path}")

    words_per_page = pdf_processor.extract_text_with_positions(pdf_path)
    annotations, stats = pdf_processor.extract_annotations_with_positions(pdf_path)
    contexts = pdf_processor.find_annotation_context(words_per_page, annotations)
    comments_rewritten = rewrite_comments(contexts, llm_client, groq_free)

    if verbose:
        print(stats)
        for page, entries in comments_rewritten.items():
            print(f"\n--- Page {page} ---")
            for entry in entries:
                print("Original:", entry["original"])
                print("Rewritten:", entry["rewritten"])
                print("Highlighted:", entry["highlighted"])
                print("Paragraph:", entry["paragraph"])
                print()

    return comments_rewritten, stats

summarize_thesis(pages_text, language, llm_client)

Summarize the thesis in a LaTeX-friendly format (the implementation concatenates the text of all extracted pages).

Parameters:

Name Type Description Default
pages_text dict[int, str]

Dictionary mapping page indices to text content.

required
language str

Language the thesis is written in ("German" or "English").

required
llm_client LLMClientProtocol

LLM client instance for API access.

required

Returns:

Type Description
str

A LaTeX-formatted summary string with escaped special characters.

Example

text = {0: "This thesis examines...", 1: "The methodology..."} client = LLMClient() summary = summarize_thesis(text, "German", client) "untersucht" in summary True "\\" in summary # LaTeX line breaks True

Source code in src/academic_doc_generator/core/llm.py
def summarize_thesis(
    pages_text: dict[int, str], language: str, llm_client: LLMClientProtocol
) -> str:
    """Generate a LaTeX-formatted summary of the thesis text.

    NOTE(review): earlier docs said "first 10 pages", but this
    implementation concatenates *all* pages in ``pages_text`` — confirm
    which scope is intended.

    Args:
        pages_text: Dictionary mapping page indices to text content.
        language: Language the thesis is written in ("German" or "English").
        llm_client: LLM client instance for API access.

    Returns:
        A LaTeX-formatted summary string with escaped special characters.

    Example:
        >>> text = {0: "This thesis examines...", 1: "The methodology..."}
        >>> client = LLMClient()
        >>> summary = summarize_thesis(text, "German", client)
        >>> "untersucht" in summary
        True
    """
    # Concatenate the pages in index order, tolerating missing indices.
    ordered_indices = sorted(pages_text.keys())
    full_text = "\n\n".join(pages_text.get(idx, "") for idx in ordered_indices)

    summary_prompt = build_prompt(
        PromptTemplate.SUMMARIZE_THESIS, language=language, text=full_text
    )

    raw_summary = llm_client.chat_completion([{"role": "user", "content": summary_prompt}])

    # Escape LaTeX special characters while keeping intentional LaTeX markup.
    return latex.escape_for_latex(raw_summary, preserve_latex=True)