Skip to content

metadata

academic_doc_generator.core.metadata

Generation of web-compatible metadata files for student projects.

generate_metadata_file(output_folder, title, author, pages_text, llm_client, work_type, semester, date_str=None, copy_to_web_folder=True, students=None)

Create a Jekyll-style Markdown file with frontmatter for a student project.

Parameters:

Name Type Description Default
output_folder str

Folder where the .md file will be saved.

required
title str

Title of the work.

required
author str

Full name of the student (or first student in group).

required
pages_text dict[int, str]

Extracted text from the PDF.

required
llm_client LLMClientProtocol

LLM client for summary generation.

required
work_type str

Type of work, e.g. "Bachelorthesis", "Masterthesis" or "Praxisprojekt".

required
semester str

Semester name (e.g., "Wintersemester 24/25").

required
date_str Optional[str]

Date of the work (YYYY-MM-DD). Defaults to today.

None
copy_to_web_folder bool

Whether to copy the file to the global web metadata folder.

True
students Optional[list[StudentInfo]]

Optional list of student info for group projects.

None

Returns:

Type Description
str

Path to the generated .md file.

Source code in src/academic_doc_generator/core/metadata.py
def generate_metadata_file(
    output_folder: str,
    title: str,
    author: str,
    pages_text: dict[int, str],
    llm_client: LLMClientProtocol,
    work_type: str,  # e.g., "Bachelorthesis", "Masterthesis", "Praxisprojekt"
    semester: str,
    date_str: Optional[str] = None,
    copy_to_web_folder: bool = True,
    students: Optional[list[StudentInfo]] = None,
) -> str:
    """Create a Jekyll-style Markdown file with frontmatter for a student project.

    The file is named ``{year}_{semester_slug}_{type_slug}_{author_slug}.md``
    and written into ``output_folder`` (which is not created here). Full
    author names that appear in the generated summary are replaced by
    initials before the summary is written.

    Args:
        output_folder: Folder where the .md file will be saved.
        title: Title of the work.
        author: Full name of the student (or first student in group).
        pages_text: Extracted text from the PDF.
        llm_client: LLM client for summary generation.
        work_type: String indicating the type of work. Its first two
            characters, lowercased, become part of the filename.
        semester: Semester name (e.g., "Wintersemester 24/25").
        date_str: Date of the work (YYYY-MM-DD). Defaults to today.
        copy_to_web_folder: Whether to copy the file to the global web metadata folder.
        students: Optional list of student info for group projects. Only used
            when it holds more than one entry; otherwise ``author`` is used.

    Returns:
        Path to the generated .md file.
    """
    unknown_names = ("Unknown Author", "Unknown", "Unbekannt")

    def yaml_quoted(value: str) -> str:
        # Inside a double-quoted YAML scalar, backslashes and double quotes
        # must be escaped or the frontmatter becomes unparsable.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    if not date_str:
        date_str = datetime.now().strftime("%Y-%m-%d")

    summary = summarize_for_web(pages_text, llm_client)

    if students and len(students) > 1:
        initials = " & ".join(get_initials(s.get("name", "U. A.")) for s in students)
        author_slug = "_".join(get_author_slug(s.get("name", "unkn")) for s in students)

        # Replace all author names with their respective initials in the summary
        for s in students:
            s_name = s.get("name")
            # Blank names are skipped so plain whitespace in the summary is
            # never rewritten.
            if s_name and s_name.strip() and s_name not in unknown_names:
                summary = summary.replace(s_name, get_initials(s_name))
    else:
        initials = get_initials(author)
        author_slug = get_author_slug(author)
        # Clean up summary: replace full author name with initials if it appears
        if author and author.strip() and author not in unknown_names:
            summary = summary.replace(author, initials)

    # Create .md filename
    # format: {year}_{semester_slug}_{type_slug}_{author_slug}.md
    year = date_str[:4]

    # Try to make a slug from semester
    sem_slug = (
        semester.lower()
        .replace(" ", "")
        .replace("/", "")
        .replace("wintersemester", "ws")
        .replace("sommersemester", "ss")
    )

    type_slug = work_type[:2].lower()

    md_filename = f"{year}_{sem_slug}_{type_slug}_{author_slug}.md"
    md_path = os.path.join(output_folder, md_filename)

    # Every line of a YAML literal block scalar must carry the block's
    # indentation; indenting only the first line would end the block at the
    # first line break of a multi-line summary.
    excerpt_block = "\n  ".join(summary.splitlines())

    safe_title = yaml_quoted(title)
    safe_initials = yaml_quoted(initials)
    safe_type = yaml_quoted(work_type)
    safe_semester = yaml_quoted(semester)

    content = f"""---
title: "{safe_title}"
author: "{safe_initials}"
date: "{date_str}"
excerpt: |
  {excerpt_block}
collection: student_projects
type: "{safe_type}"
semester: "{safe_semester}"
---

{summary}
"""
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(content)

    # Copy to global web metadata folder if configured and requested
    if copy_to_web_folder:
        global_config = load_global_config()
        web_metadata_folder = global_config.get("web_metadata_folder")
        if web_metadata_folder:
            # Best effort: a failed copy is reported but must not invalidate
            # the file that was already written to output_folder.
            try:
                os.makedirs(web_metadata_folder, exist_ok=True)
                shutil.copy2(md_path, os.path.join(web_metadata_folder, md_filename))
                print(f"✅ Web metadata also copied to: {web_metadata_folder}")
            except Exception as e:
                print(f"⚠️  Error copying web metadata: {e}")

    return md_path

get_author_slug(name)

Generate a short slug from the author's name.

Parameters:

Name Type Description Default
name str

Full name of the person.

required

Returns:

Type Description
str

A lowercase slug of up to 4 characters (shorter when a single-word name has fewer than four characters).

Source code in src/academic_doc_generator/core/metadata.py
def get_author_slug(name: str) -> str:
    """Build a short lowercase slug from a person's name.

    Hyphens count as word separators. With two or more words the slug is the
    first two characters of the first word followed by the first two
    characters of the last word; with a single word it is that word's first
    four characters.

    Args:
        name: Full name of the person.

    Returns:
        A lowercase slug of at most four characters, or ``"unkn"`` when the
        name is empty, a known placeholder, or contains no words.
    """
    fallback = "unkn"
    if not name or name in ["Unknown Author", "Unknown", "Unbekannt"]:
        return fallback

    # Hyphenated names are split so each component counts as its own word.
    tokens = name.replace("-", " ").split()
    if not tokens:
        return fallback
    if len(tokens) == 1:
        return tokens[0][:4].lower()

    head, tail = tokens[0], tokens[-1]
    return head[:2].lower() + tail[:2].lower()

get_initials(name)

Generate initials for a name (e.g., 'Max Mustermann' -> 'M. M.').

Parameters:

Name Type Description Default
name str

Full name of the person.

required

Returns:

Type Description
str

String containing uppercase initials followed by dots.

Source code in src/academic_doc_generator/core/metadata.py
def get_initials(name: str) -> str:
    """Generate initials for a name (e.g., 'Max Mustermann' -> 'M. M.').

    Hyphens are treated as word separators, so 'Anna-Lena Schmidt' yields
    'A. L. S.'.

    Args:
        name: Full name of the person.

    Returns:
        String containing uppercase initials followed by dots, or 'U. A.'
        when the name is empty, a known placeholder, or contains no words.
    """
    placeholder = "U. A."
    if not name or name in ["Unknown Author", "Unknown", "Unbekannt"]:
        return placeholder
    # Replace hyphens with spaces to treat as separate parts
    parts = name.replace("-", " ").split()
    if not parts:
        # Whitespace-only or hyphen-only input would otherwise produce an
        # empty string; use the same placeholder as for unknown names.
        return placeholder
    return " ".join(f"{part[0].upper()}." for part in parts)

summarize_for_web(pages_text, llm_client)

Generate a concise, English summary suitable for publication on a website.

Parameters:

Name Type Description Default
pages_text dict[int, str]

Mapping of page indices to text content.

required
llm_client LLMClientProtocol

LLM client for summary generation.

required

Returns:

Type Description
str

A short English summary string.

Source code in src/academic_doc_generator/core/metadata.py
def summarize_for_web(pages_text: dict[int, str], llm_client: LLMClientProtocol) -> str:
    """Ask the LLM for a web summary of the document.

    The text of the ten lowest-keyed pages is joined with blank lines,
    inserted into the SUMMARIZE_FOR_WEB prompt template and sent as a single
    user message.

    Args:
        pages_text: Mapping of page indices to text content.
        llm_client: LLM client for summary generation.

    Returns:
        The LLM reply with surrounding whitespace stripped.
    """
    # Only the first 10 pages (in ascending key order) are summarized,
    # as in the original script.
    leading_pages = sorted(pages_text.keys())[:10]
    excerpt = "\n\n".join(pages_text.get(page, "") for page in leading_pages)

    prompt = build_prompt(PromptTemplate.SUMMARIZE_FOR_WEB, text=excerpt)
    reply = llm_client.chat_completion([{"role": "user", "content": prompt}])
    return reply.strip()