Skip to content

API Reference

This page contains the automatically generated API documentation for the Semantic Backup Explorer.

semantic_backup_explorer.core.backup_operations

Core business logic for backup operations.

Classes

BackupComparisonResult dataclass

Result of comparing a local folder with its backup.

Source code in semantic_backup_explorer/core/backup_operations.py
@dataclass
class BackupComparisonResult:
    """Result of comparing a local folder with its backup."""

    local_path: Path              # the local folder that was compared
    backup_path: Optional[Path]   # matched backup folder; None when the lookup failed
    only_local: list[str]         # relative paths present only in the local folder
    only_backup: list[str]        # relative paths present only in the backup
    in_both: list[str]            # relative paths present on both sides
    error: Optional[str] = None   # human-readable failure reason, if any

BackupOperations

High-level operations for backup management.

Source code in semantic_backup_explorer/core/backup_operations.py
class BackupOperations:
    """High-level operations for backup management."""

    def __init__(self, index_path: Path, rag_pipeline: Optional["RAGPipeline"] = None):
        """
        Initialize BackupOperations.

        Args:
            index_path: Path to the backup index file.
            rag_pipeline: Optional RAG pipeline for semantic folder matching.
        """
        self.index_path = index_path
        self.rag_pipeline = rag_pipeline

    @staticmethod
    def _failure_result(local_path: Path, message: Optional[str]) -> BackupComparisonResult:
        """Build a comparison result with empty diff lists and an error message."""
        return BackupComparisonResult(
            local_path=local_path,
            backup_path=None,
            only_local=[],
            only_backup=[],
            in_both=[],
            error=message,
        )

    def verify_backup_drive(self) -> tuple[bool, Optional[str]]:
        """
        Check that the currently connected drive is the one the index was built from.

        Returns:
            A tuple of (is_correct, error_message).
        """
        metadata = get_index_metadata(self.index_path)
        root = metadata.root_path

        if not root:
            return False, "Kein gültiger Index gefunden."

        if not root.exists():
            return False, f"Das Backup-Laufwerk ({root}) ist nicht angeschlossen."

        # A label mismatch means a different disk is mounted at the same path.
        expected = metadata.label
        if expected:
            actual = get_volume_label(root)
            if actual and actual != expected:
                conflict = (
                    f"Laufwerks-Konflikt! Erwartetes Label: '{expected}', "
                    f"Gefundenes Label: '{actual}'. "
                    "Bitte schließe das richtige Laufwerk an oder erstelle einen neuen Index."
                )
                return False, conflict

        return True, None

    def find_and_compare(self, local_path: Path) -> BackupComparisonResult:
        """
        Locate the backup folder matching ``local_path`` and diff the contents.

        Args:
            local_path: The local folder to compare.

        Returns:
            A BackupComparisonResult object.
        """
        # Without the right drive attached the index cannot be trusted.
        drive_ok, drive_error = self.verify_backup_drive()
        if not drive_ok:
            return self._failure_result(local_path, drive_error)

        if not local_path.exists():
            return self._failure_result(local_path, f"Local path does not exist: {local_path}")

        folder_name = local_path.name or str(local_path)
        match = find_backup_folder(folder_name, self.index_path)

        # Exact lookup failed — fall back to semantic search when available.
        if not match and self.rag_pipeline is not None:
            logger.info(f"No direct match for {folder_name}, trying RAG search...")
            match = self._rag_search(folder_name)

        if not match:
            return self._failure_result(local_path, f"No matching backup folder found for {folder_name}")

        backup_files = get_all_files_from_index(match, self.index_path)
        diff: FolderDiffResult = compare_folders(local_path, backup_files)

        return BackupComparisonResult(
            local_path=local_path,
            backup_path=Path(match),
            only_local=diff["only_local"],
            only_backup=diff["only_backup"],
            in_both=diff["in_both"],
        )

    def _rag_search(self, folder_name: str) -> Optional[str]:
        """
        Ask the RAG pipeline where a folder lives inside the backup.

        Args:
            folder_name: The folder name to search for.

        Returns:
            The path of the most likely matching folder, or None.
        """
        if not self.rag_pipeline:
            return None

        try:
            question = f"In welchem Ordner im Backup befinden sich die Dateien für '{folder_name}'? Nenne nur den Pfad."
            answer, _ = self.rag_pipeline.answer_question(question)

            # Take the first answer line and strip a markdown heading prefix.
            candidate = answer.strip().split("\n")[0].strip()
            if candidate.startswith("## "):
                candidate = candidate[3:]

            # Accept only strings that actually look like filesystem paths.
            if "/" in candidate or "\\" in candidate:
                return candidate
        except Exception as exc:
            logger.error(f"Error during RAG search for folder: {exc}")

        return None
Functions
__init__(index_path, rag_pipeline=None)

Initialize BackupOperations.

Parameters:

Name Type Description Default
index_path Path

Path to the backup index file.

required
rag_pipeline Optional[RAGPipeline]

Optional RAG pipeline for semantic folder matching.

None
Source code in semantic_backup_explorer/core/backup_operations.py
def __init__(self, index_path: Path, rag_pipeline: Optional["RAGPipeline"] = None):
    """
    Initialize BackupOperations.

    Args:
        index_path: Path to the backup index file.
        rag_pipeline: Optional RAG pipeline for semantic folder matching.
            When omitted, folder lookups rely on exact index matches only.
    """
    self.index_path = index_path  # backup index consulted by all operations
    self.rag_pipeline = rag_pipeline  # may be None; checked before each use
find_and_compare(local_path)

Finds the matching backup folder and compares contents.

Parameters:

Name Type Description Default
local_path Path

The local folder to compare.

required

Returns:

Type Description
BackupComparisonResult

A BackupComparisonResult object.

Source code in semantic_backup_explorer/core/backup_operations.py
def find_and_compare(self, local_path: Path) -> BackupComparisonResult:
    """
    Finds the matching backup folder and compares contents.

    Args:
        local_path: The local folder to compare.

    Returns:
        A BackupComparisonResult object.
    """

    def _failure(message: Optional[str]) -> BackupComparisonResult:
        # Empty diff lists plus an error message signal a failed lookup.
        return BackupComparisonResult(
            local_path=local_path,
            backup_path=None,
            only_local=[],
            only_backup=[],
            in_both=[],
            error=message,
        )

    # The drive check comes first: without the right disk the index is useless.
    drive_ok, drive_error = self.verify_backup_drive()
    if not drive_ok:
        return _failure(drive_error)

    if not local_path.exists():
        return _failure(f"Local path does not exist: {local_path}")

    folder_name = local_path.name or str(local_path)
    match = find_backup_folder(folder_name, self.index_path)

    # Semantic fallback when the exact lookup misses.
    if not match and self.rag_pipeline is not None:
        logger.info(f"No direct match for {folder_name}, trying RAG search...")
        match = self._rag_search(folder_name)

    if not match:
        return _failure(f"No matching backup folder found for {folder_name}")

    backup_files = get_all_files_from_index(match, self.index_path)
    diff: FolderDiffResult = compare_folders(local_path, backup_files)

    return BackupComparisonResult(
        local_path=local_path,
        backup_path=Path(match),
        only_local=diff["only_local"],
        only_backup=diff["only_backup"],
        in_both=diff["in_both"],
    )
verify_backup_drive()

Verifies if the currently connected drive matches the one in the index.

Returns:

Type Description
tuple[bool, Optional[str]]

A tuple of (is_correct, error_message).

Source code in semantic_backup_explorer/core/backup_operations.py
def verify_backup_drive(self) -> tuple[bool, Optional[str]]:
    """
    Verifies if the currently connected drive matches the one in the index.

    Returns:
        A tuple of (is_correct, error_message).
    """
    metadata = get_index_metadata(self.index_path)
    root = metadata.root_path

    if not root:
        return False, "Kein gültiger Index gefunden."

    if not root.exists():
        return False, f"Das Backup-Laufwerk ({root}) ist nicht angeschlossen."

    # Compare labels only when the index recorded one and the drive reports one.
    expected = metadata.label
    if expected:
        actual = get_volume_label(root)
        if actual and actual != expected:
            conflict = (
                f"Laufwerks-Konflikt! Erwartetes Label: '{expected}', "
                f"Gefundenes Label: '{actual}'. "
                "Bitte schließe das richtige Laufwerk an oder erstelle einen neuen Index."
            )
            return False, conflict

    return True, None

Functions

semantic_backup_explorer.indexer.scan_backup

Module for scanning backup directories and creating a markdown index.

Functions

scan_backup(root_path, output_file='data/backup_index.md', callback=None)

Recursively scans the root_path and writes every file and folder with its full path into a structured markdown file.

Parameters:

Name Type Description Default
root_path str | Path

Path to the backup directory to scan.

required
output_file str | Path

Path to the output markdown file.

'data/backup_index.md'
callback Optional[Callable[[int, str], None]]

Optional callback function called with (count, current_root).

None

Raises:

Type Description
FileNotFoundError

If root_path does not exist.

NotADirectoryError

If root_path is not a directory.

PermissionError

If output_file cannot be written.

Source code in semantic_backup_explorer/indexer/scan_backup.py
def scan_backup(
    root_path: str | Path,
    output_file: str | Path = "data/backup_index.md",
    callback: Optional[Callable[[int, str], None]] = None,
) -> None:
    """
    Recursively scans the root_path and writes every file and folder
    with its full path into a structured markdown file.

    Directory and file names are sorted in place so that both the traversal
    order and the written index are deterministic across runs and filesystems
    (os.walk otherwise yields entries in arbitrary, OS-dependent order).

    Args:
        root_path: Path to the backup directory to scan.
        output_file: Path to the output markdown file.
        callback: Optional callback function called with (count, current_root).

    Raises:
        FileNotFoundError: If root_path does not exist.
        NotADirectoryError: If root_path is not a directory.
        PermissionError: If output_file cannot be written.
    """
    root_path = Path(root_path).resolve()
    if not root_path.exists():
        raise FileNotFoundError(f"Backup path does not exist: {root_path}")
    if not root_path.is_dir():
        raise NotADirectoryError(f"Backup path is not a directory: {root_path}")

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        label = get_volume_label(root_path)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("# Backup Index\n\n")
            root_line = f"Root: {root_path}"
            if label:
                root_line += f" (Label: {label})"
            f.write(f"{root_line}\n\n")

            count = 0
            for root, dirs, files in tqdm(os.walk(root_path), desc="Scanning directories"):
                # Sorting dirs in place also fixes the order in which os.walk
                # descends, making the whole index reproducible.
                dirs.sort()
                files.sort()
                count += 1
                if callback:
                    callback(count, root)
                current_path = Path(root)
                f.write(f"## {current_path}\n\n")

                for d in dirs:
                    f.write(f"- {current_path / d}/\n")
                for name in files:
                    file_path = current_path / name
                    try:
                        # mtime lets the comparison step detect locally newer files.
                        mtime = os.path.getmtime(file_path)
                        f.write(f"- {file_path} | mtime:{mtime}\n")
                    except Exception:
                        # Best effort: index the path even when stat fails.
                        f.write(f"- {file_path}\n")
                f.write("\n")
    except PermissionError as e:
        raise PermissionError(f"Cannot write to output file: {output_path}") from e

semantic_backup_explorer.rag.rag_pipeline

Module for the RAG (Retrieval-Augmented Generation) pipeline.

Classes

RAGPipeline

Orchestrates the retrieval and generation process to answer questions about backups.

Source code in semantic_backup_explorer/rag/rag_pipeline.py
class RAGPipeline:
    """
    Orchestrates the retrieval and generation process to answer questions about backups.
    """

    def __init__(self) -> None:
        """
        Initialize the RAG pipeline with embedder, retriever, and LLM client.

        Raises:
            ImportError: If any semantic dependencies are missing.
        """
        # Fail fast when the optional 'semantic' extra is not installed.
        if not HAS_LLM_CLIENT:
            raise ImportError("llm-client is not installed. Please install it with 'pip install -e .[semantic]'")
        self.embedder = Embedder()
        self.retriever = Retriever()
        # Groq is the default LLM backend.
        self.client = LLMClient(api_choice="groq")

    def answer_question(self, question: str) -> tuple[str, str]:
        """
        Answers a question using retrieved context from the backup index.

        Args:
            question: The user's question.

        Returns:
            A tuple of (answer_text, context_text).
        """
        # Embed the question, then pull the three closest index chunks.
        query_embedding = self.embedder.embed_query(question)
        results = self.retriever.query(query_embedding, n_results=3)

        documents = results.get("documents")
        top_docs = documents[0] if documents else []
        context = "\n\n".join(top_docs) if top_docs else ""

        # Grounded prompt (German, matching the target audience).
        prompt = f"""
Du bist ein hilfreicher Assistent für die Suche in Backup-Strukturen.
Basierend auf den folgenden Informationen aus dem Backup-Index, beantworte die Frage des Nutzers.
Wenn die Information nicht im Kontext enthalten ist, sage dass du es nicht weißt.

Kontext:
{context}

Frage: {question}

Antwort:"""

        messages = [
            {"role": "system", "content": "Du bist ein Backup-Explorer Assistent."},
            {"role": "user", "content": prompt},
        ]

        response = self.client.chat_completion(messages)
        return response, context
Functions
__init__()

Initialize the RAG pipeline with embedder, retriever, and LLM client.

Raises:

Type Description
ImportError

If any semantic dependencies are missing.

Source code in semantic_backup_explorer/rag/rag_pipeline.py
def __init__(self) -> None:
    """
    Initialize the RAG pipeline with embedder, retriever, and LLM client.

    Raises:
        ImportError: If any semantic dependencies are missing.
    """
    # Fail fast when the optional 'semantic' extra is not installed.
    if not HAS_LLM_CLIENT:
        raise ImportError("llm-client is not installed. Please install it with 'pip install -e .[semantic]'")
    self.embedder = Embedder()
    self.retriever = Retriever()
    # Default to groq as requested
    self.client = LLMClient(api_choice="groq")
answer_question(question)

Answers a question using retrieved context from the backup index.

Parameters:

Name Type Description Default
question str

The user's question.

required

Returns:

Type Description
tuple[str, str]

A tuple of (answer_text, context_text).

Source code in semantic_backup_explorer/rag/rag_pipeline.py
    def answer_question(self, question: str) -> tuple[str, str]:
        """
        Answers a question using retrieved context from the backup index.

        Args:
            question: The user's question.

        Returns:
            A tuple of (answer_text, context_text).
        """
        # 1. Embed question
        query_embedding = self.embedder.embed_query(question)

        # 2. Retrieve relevant chunks
        results = self.retriever.query(query_embedding, n_results=3)

        documents = results.get("documents")
        if documents and len(documents) > 0:
            doc_list = documents[0]
            if doc_list:
                context = "\n\n".join(doc_list)
            else:
                context = ""
        else:
            context = ""

        # 3. Generate answer
        prompt = f"""
Du bist ein hilfreicher Assistent für die Suche in Backup-Strukturen.
Basierend auf den folgenden Informationen aus dem Backup-Index, beantworte die Frage des Nutzers.
Wenn die Information nicht im Kontext enthalten ist, sage dass du es nicht weißt.

Kontext:
{context}

Frage: {question}

Antwort:"""

        messages = [
            {"role": "system", "content": "Du bist ein Backup-Explorer Assistent."},
            {"role": "user", "content": prompt},
        ]

        response = self.client.chat_completion(messages)
        return response, context

semantic_backup_explorer.compare.folder_diff

Module for comparing local folders with backup contents.

Classes

FolderDiffResult

Bases: TypedDict

Result of folder comparison.

Source code in semantic_backup_explorer/compare/folder_diff.py
class FolderDiffResult(TypedDict):
    """Result of folder comparison."""

    only_local: list[str]   # relative paths present locally but not in the backup
    only_backup: list[str]  # relative paths present in the backup only
    in_both: list[str]      # relative paths present on both sides

Functions

compare_folders(local_path, backup_files)

Compares local folder content with backup files.

Files with newer local modification times are included in 'only_local' to trigger sync.

Parameters:

Name Type Description Default
local_path str | Path

Path to the local folder.

required
backup_files Union[list[str], dict[str, float]]

Either a list of relative paths or a dictionary mapping relative paths to modification timestamps.

required

Returns:

Type Description
FolderDiffResult

A TypedDict containing lists of files 'only_local', 'only_backup', and 'in_both'.

Raises:

Type Description
FileNotFoundError

If local_path does not exist.

NotADirectoryError

If local_path is not a directory.

Source code in semantic_backup_explorer/compare/folder_diff.py
def compare_folders(local_path: str | Path, backup_files: Union[list[str], dict[str, float]]) -> FolderDiffResult:
    """
    Compares local folder content with backup files.

    Files with newer local modification times are included in 'only_local' to trigger sync.

    Args:
        local_path: Path to the local folder.
        backup_files: Either a list of relative paths or a dictionary mapping
                     relative paths to modification timestamps.

    Returns:
        A TypedDict containing lists of files 'only_local', 'only_backup', and 'in_both'.

    Raises:
        FileNotFoundError: If local_path does not exist.
        NotADirectoryError: If local_path is not a directory.
    """
    local_path = Path(local_path)
    if not local_path.exists():
        raise FileNotFoundError(f"Local path does not exist: {local_path}")
    if not local_path.is_dir():
        raise NotADirectoryError(f"Local path is not a directory: {local_path}")

    local_files_dict = get_folder_content(local_path)
    local_paths = set(local_files_dict)

    # Iterating a dict yields its keys, so this accepts both input shapes.
    backup_paths = set(backup_files)

    only_local = local_paths - backup_paths
    only_backup = backup_paths - local_paths
    in_both = local_paths & backup_paths

    # Treat files that are newer locally as missing from the backup so the
    # sync step refreshes them. Timestamps only exist when backup_files is a
    # mapping; the 0.1 s epsilon absorbs float/filesystem mtime rounding.
    newer_locally: set[str] = set()
    if isinstance(backup_files, dict):
        newer_locally = {
            path
            for path in in_both
            if local_files_dict.get(path, 0.0) > backup_files.get(path, 0.0) + 0.1
        }

    only_local |= newer_locally
    in_both -= newer_locally

    return {
        "only_local": sorted(only_local),
        "only_backup": sorted(only_backup),
        "in_both": sorted(in_both),
    }

get_folder_content(folder_path)

Returns a dictionary of relative file paths and their modification times.

Parameters:

Name Type Description Default
folder_path str | Path

Path to the folder to scan.

required

Returns:

Type Description
dict[str, float]

Dictionary mapping relative file paths to their modification timestamps.

Source code in semantic_backup_explorer/compare/folder_diff.py
def get_folder_content(folder_path: str | Path) -> dict[str, float]:
    """
    Returns a dictionary of relative file paths and their modification times.

    Args:
        folder_path: Path to the folder to scan.

    Returns:
        Dictionary mapping relative file paths to their modification timestamps.
    """
    folder_path = Path(folder_path)
    if not folder_path.exists():
        return {}

    files: dict[str, float] = {}
    for root, _, filenames in os.walk(folder_path):
        for f in filenames:
            full_path = Path(root) / f
            rel_path = str(full_path.relative_to(folder_path))
            try:
                files[rel_path] = os.path.getmtime(full_path)
            except Exception:
                files[rel_path] = 0.0
    return files

semantic_backup_explorer.sync.sync_missing

Module for synchronizing files between local and backup directories.

Classes

SyncProgressCallback

Bases: Protocol

Protocol for sync progress callbacks.

Source code in semantic_backup_explorer/sync/sync_missing.py
class SyncProgressCallback(Protocol):
    """Protocol for sync progress callbacks.

    Structural type: any callable with a matching signature satisfies this
    protocol; implementers do not need to subclass it.
    """

    def __call__(self, current: int, total: int, filename: str, error: Optional[str] = None) -> None:
        """
        Called for each file processed.

        Args:
            current: Current file number (1-indexed).
            total: Total number of files.
            filename: Relative path of current file.
            error: Error message if sync failed, None if successful.
        """
        ...
Functions
__call__(current, total, filename, error=None)

Called for each file processed.

Parameters:

Name Type Description Default
current int

Current file number (1-indexed).

required
total int

Total number of files.

required
filename str

Relative path of current file.

required
error Optional[str]

Error message if sync failed, None if successful.

None
Source code in semantic_backup_explorer/sync/sync_missing.py
def __call__(self, current: int, total: int, filename: str, error: Optional[str] = None) -> None:
    """
    Called for each file processed.

    Args:
        current: Current file number (1-indexed).
        total: Total number of files.
        filename: Relative path of current file.
        error: Error message if sync failed, None if successful.
    """
    ...  # protocol stub: implementations supply the body

Functions

sync_files(files_to_sync, source_root, target_root, callback=None)

Copies files from source_root to target_root.

Parameters:

Name Type Description Default
files_to_sync list[str]

List of relative file paths to copy.

required
source_root str | Path

Source directory.

required
target_root str | Path

Target directory.

required
callback Optional[SyncProgressCallback]

Optional progress callback.

None

Returns:

Type Description
tuple[list[str], list[tuple[str, str]]]

Tuple of (synced_files, errors) where errors is a list of (filename, error_msg).

Raises:

Type Description
FileNotFoundError

If source_root does not exist.

Source code in semantic_backup_explorer/sync/sync_missing.py
def sync_files(
    files_to_sync: list[str], source_root: str | Path, target_root: str | Path, callback: Optional[SyncProgressCallback] = None
) -> tuple[list[str], list[tuple[str, str]]]:
    """
    Copies files from source_root to target_root.

    Args:
        files_to_sync: List of relative file paths to copy.
        source_root: Source directory.
        target_root: Target directory.
        callback: Optional progress callback.

    Returns:
        Tuple of (synced_files, errors) where errors is a list of (filename, error_msg).

    Raises:
        FileNotFoundError: If source_root does not exist.
    """
    src_root = Path(source_root)
    dst_root = Path(target_root)

    if not src_root.exists():
        raise FileNotFoundError(f"Source root does not exist: {src_root}")

    synced: list[str] = []
    errors: list[tuple[str, str]] = []
    total = len(files_to_sync)

    for position, rel_path in enumerate(files_to_sync, start=1):
        failure: Optional[str] = None
        try:
            destination = dst_root / rel_path
            destination.parent.mkdir(parents=True, exist_ok=True)
            # copy2 preserves file metadata (including mtime) along with content.
            shutil.copy2(src_root / rel_path, destination)
            synced.append(rel_path)
        except Exception as exc:
            failure = str(exc)
            errors.append((rel_path, failure))

        # Report progress after each file, successful or not.
        if callback:
            callback(position, total, rel_path, failure)

    return synced, errors

semantic_backup_explorer.utils.config

Centralized configuration for backup operations.

Classes

BackupConfig

Bases: BaseSettings

Central configuration for backup operations.

Loads values from environment variables or a .env file.

Source code in semantic_backup_explorer/utils/config.py
class BackupConfig(BaseSettings):  # type: ignore[misc]
    """
    Central configuration for backup operations.

    Loads values from environment variables or a .env file.
    """

    # Mount point of the external backup drive.
    backup_drive: Path = Path("/media/backup")
    # Markdown index file produced by the backup scanner.
    index_path: Path = Path("data/backup_index.md")
    # Storage location for embeddings (presumably used by the retriever — confirm).
    embeddings_path: Path = Path("data/embeddings")
    # Groq API key; empty by default — TODO confirm where it is consumed.
    groq_api_key: str = ""

    # Read overrides from .env; ignore unrelated env entries.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

    def validate_backup_drive(self) -> None:
        """
        Validate that backup drive exists and is accessible.

        Raises:
            ValueError: If backup drive does not exist.
        """
        if not self.backup_drive.exists():
            raise ValueError(f"Backup drive not found: {self.backup_drive}")
Functions
validate_backup_drive()

Validate that backup drive exists and is accessible.

Raises:

Type Description
ValueError

If backup drive does not exist.

Source code in semantic_backup_explorer/utils/config.py
def validate_backup_drive(self) -> None:
    """
    Validate that backup drive exists and is accessible.

    Raises:
        ValueError: If backup drive does not exist.
    """
    if self.backup_drive.exists():
        return
    raise ValueError(f"Backup drive not found: {self.backup_drive}")