From f0579a7743fc5f38a8643ca3357bd713e30d832c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 19:57:11 +0000
Subject: [PATCH 1/6] Scaffold initial ScribeFlow project structure and docs

---
 .gitignore                             |  27 +++
 .scribeflow/.gitkeep                   |   0
 README.md                              | 248 ++++++++++++++++++++++++-
 archive/completed/.gitkeep             |   0
 archive/failed/.gitkeep                |   0
 config/scribeflow.example.toml         |  22 +++
 docs/ARCHITECTURE.md                   |   3 +
 docs/CONFIGURATION.md                  |   3 +
 inbox/mp3/.gitkeep                     |   0
 inbox/mp4/.gitkeep                     |   0
 logs/.gitkeep                          |   0
 output/markdown/.gitkeep               |   0
 output/raw_json/.gitkeep               |   0
 output/subtitles/.gitkeep              |   0
 pyproject.toml                         |  40 ++++
 scripts/bootstrap.sh                   |   7 +
 src/scribeflow/__init__.py             |   4 +
 src/scribeflow/__main__.py             |   7 +
 src/scribeflow/cli.py                  |  11 ++
 src/scribeflow/commands/__init__.py    |   0
 src/scribeflow/commands/clean.py       |   0
 src/scribeflow/commands/init.py        |   0
 src/scribeflow/commands/process.py     |   0
 src/scribeflow/commands/reprocess.py   |   0
 src/scribeflow/commands/retry.py       |   0
 src/scribeflow/commands/scan.py        |   0
 src/scribeflow/commands/status.py      |   0
 src/scribeflow/core/__init__.py        |   0
 src/scribeflow/core/config.py          |   0
 src/scribeflow/core/ledger.py          |   0
 src/scribeflow/core/models.py          |   0
 src/scribeflow/pipeline/__init__.py    |   0
 src/scribeflow/pipeline/formatter.py   |   0
 src/scribeflow/pipeline/media.py       |   0
 src/scribeflow/pipeline/scanner.py     |   0
 src/scribeflow/pipeline/transcriber.py |   0
 tests/__init__.py                      |   0
 tests/test_smoke.py                    |   7 +
 38 files changed, 378 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore
 create mode 100644 .scribeflow/.gitkeep
 create mode 100644 archive/completed/.gitkeep
 create mode 100644 archive/failed/.gitkeep
 create mode 100644 config/scribeflow.example.toml
 create mode 100644 docs/ARCHITECTURE.md
 create mode 100644 docs/CONFIGURATION.md
 create mode 100644 inbox/mp3/.gitkeep
 create mode 100644 inbox/mp4/.gitkeep
 create mode 100644 logs/.gitkeep
 create mode 100644 output/markdown/.gitkeep
 create mode 100644 output/raw_json/.gitkeep
 create mode 100644 output/subtitles/.gitkeep
 create mode 100644 pyproject.toml
 create mode 100755 scripts/bootstrap.sh
 create mode 100644 src/scribeflow/__init__.py
 create mode 100644 src/scribeflow/__main__.py
 create mode 100644 src/scribeflow/cli.py
 create mode 100644 src/scribeflow/commands/__init__.py
 create mode 100644 src/scribeflow/commands/clean.py
 create mode 100644 src/scribeflow/commands/init.py
 create mode 100644 src/scribeflow/commands/process.py
 create mode 100644 src/scribeflow/commands/reprocess.py
 create mode 100644 src/scribeflow/commands/retry.py
 create mode 100644 src/scribeflow/commands/scan.py
 create mode 100644 src/scribeflow/commands/status.py
 create mode 100644 src/scribeflow/core/__init__.py
 create mode 100644 src/scribeflow/core/config.py
 create mode 100644 src/scribeflow/core/ledger.py
 create mode 100644 src/scribeflow/core/models.py
 create mode 100644 src/scribeflow/pipeline/__init__.py
 create mode 100644 src/scribeflow/pipeline/formatter.py
 create mode 100644 src/scribeflow/pipeline/media.py
 create mode 100644 src/scribeflow/pipeline/scanner.py
 create mode 100644 src/scribeflow/pipeline/transcriber.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_smoke.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fabf5bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Python
+__pycache__/
+*.py[cod]
+*.so
+.pytest_cache/
+
+# Virtual environments
+.venv/
+
+# Local ledger/runtime state
+.scribeflow/*
+!.scribeflow/.gitkeep
+
+# Local outputs
+output/markdown/*
+!output/markdown/.gitkeep
+output/raw_json/*
+!output/raw_json/.gitkeep
+output/subtitles/*
+!output/subtitles/.gitkeep
+logs/*
+!logs/.gitkeep
+
+# OS/editor
+.DS_Store
+.vscode/
+.idea/
diff --git a/.scribeflow/.gitkeep b/.scribeflow/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
index f8a3681..f40ed18 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,248 @@
 # ScribeFlow
-CLI tool for converting MP4 and MP3 lectures into clean, timestamped Markdown transcripts using local speech-to-text.
+
+> Local-first CLI for converting MP4/MP3 lectures into timestamped Markdown transcripts.
+
+## Project Overview
+ScribeFlow is an open-source, local-first transcription workflow for people who want structured notes from recorded audio or video.
+
+It is designed for students, researchers, educators, and builders who need:
+- reproducible transcript generation,
+- clear file tracking,
+- clean Markdown outputs suitable for study, search, and AI/RAG workflows,
+- a privacy-friendly workflow that can run fully on local machines.
+
+## Why ScribeFlow Exists
+Most transcription workflows are fragmented: one tool for conversion, another for transcription, another for cleanup, and no reliable ledger to track what was processed.
+
+ScribeFlow solves this by combining ingestion, normalization, transcription, and Markdown formatting in one CLI workflow with a local SQLite ledger to prevent duplicate work.
+
+## Core Features
+- Local-first processing pipeline for MP4 and MP3 lecture files
+- Inbox-based workflow (`inbox/mp4/` and `inbox/mp3/`)
+- File hashing and deduplication
+- SQLite processing ledger for status tracking
+- FFmpeg-based extraction and normalization
+- `faster-whisper` default speech-to-text backend
+- Raw transcript JSON output for downstream tooling
+- Optional subtitle output (SRT/VTT)
+- Clean timestamped Markdown transcript generation
+- Retry and reprocess support for failed or updated files
+- Rich terminal UX for status and progress reporting
+
+## Example Workflow
+1. Add lecture/video files to `inbox/mp4/`
+2. Add audio files to `inbox/mp3/`
+3. Run `scribeflow scan`
+4. ScribeFlow finds new files and computes content hashes
+5. The SQLite ledger is checked for duplicates and prior status
+6. New items are marked `pending`
+7. Run `scribeflow process` to process all pending items
+8. MP4 files are converted to audio via FFmpeg
+9. MP3 files are normalized to WAV for stable transcription
+10. Speech-to-text runs with `faster-whisper`
+11. Raw transcript JSON is written to `output/raw_json/`
+12. Optional subtitle files are written to `output/subtitles/`
+13. Timestamped Markdown is generated in `output/markdown/`
+14. Ledger status updates to `completed`
+15. (Optional) Original files are moved to `archive/completed/`
+16. Failed jobs are marked `failed` and can be retried with `scribeflow retry`
+
+## Folder Structure
+```text
+ScribeFlow/
+├── archive/
+│   ├── completed/
+│   └── failed/
+├── config/
+│   └── scribeflow.example.toml
+├── docs/
+│   ├── ARCHITECTURE.md
+│   └── CONFIGURATION.md
+├── inbox/
+│   ├── mp3/
+│   └── mp4/
+├── logs/
+├── output/
+│   ├── markdown/
+│   ├── raw_json/
+│   └── subtitles/
+├── scripts/
+│   └── bootstrap.sh
+├── src/
+│   └── scribeflow/
+│       ├── __main__.py
+│       ├── cli.py
+│       ├── commands/
+│       ├── core/
+│       └── pipeline/
+├── tests/
+├── .scribeflow/
+│   └── (sqlite ledger lives here)
+├── pyproject.toml
+├── LICENSE
+└── README.md
+```
+
+## Installation Requirements
+- Python 3.11+
+- FFmpeg available on PATH
+- OS: macOS, Linux, or Windows (WSL recommended on Windows)
+
+## FFmpeg Requirement
+ScribeFlow depends on FFmpeg for media extraction and normalization.
+
+Check installation:
+```bash
+ffmpeg -version
+```
+
+Install examples:
+- macOS (Homebrew): `brew install ffmpeg`
+- Ubuntu/Debian: `sudo apt-get install ffmpeg`
+- Windows (choco): `choco install ffmpeg`
+
+## Python Setup
+```bash
+python3.11 -m venv .venv
+source .venv/bin/activate
+pip install --upgrade pip
+pip install -e ".[dev]"
+```
+
+## Basic Usage
+```bash
+scribeflow init
+scribeflow scan
+scribeflow process
+scribeflow status
+```
+
+## CLI Commands
+- `scribeflow init` — initialize folders, config, and ledger
+- `scribeflow scan` — scan inbox folders and register new files as pending
+- `scribeflow status` — show processing counts and recent jobs
+- `scribeflow process` — process all pending files end-to-end
+- `scribeflow retry` — retry failed jobs
+- `scribeflow reprocess --file <filename>` — force reprocess one file
+- `scribeflow clean` — clean temporary artifacts and stale intermediate files
+- `scribeflow version` — print installed version
+
+## Configuration
+Configuration is expected to be file-based (TOML/YAML support planned; TOML shown by default).
+
+Suggested config surface:
+- input/output directories
+- archive behavior
+- transcription model + device settings
+- subtitle output toggle (SRT/VTT)
+- hashing and duplicate strategy
+- retries and failure policy
+- logging verbosity
+
+See `/config/scribeflow.example.toml` and `/docs/CONFIGURATION.md`.
+
+## How the SQLite Ledger Works
+ScribeFlow keeps a local SQLite database to persist processing state.
+
+Recommended ledger responsibilities:
+- track canonical file path and content hash
+- track status transitions (`discovered` -> `pending` -> `processing` -> `completed`/`failed`)
+- store attempt count and timestamps
+- record output artifact locations
+- avoid duplicate processing by hash match
+
+Recommended location: `.scribeflow/ledger.db`
+
+## File Status Lifecycle
+Typical lifecycle for each media file:
+1. `discovered`
+2. `pending`
+3. `processing`
+4. `completed` or `failed`
+5. `retrying` (when `scribeflow retry` runs)
+6. back to `processing`, then terminal state
+
+## Markdown Output Format
+Each transcript should be human-readable and machine-parseable:
+- title and source metadata
+- processing timestamp
+- optional model/config metadata
+- timestamped sections/segments
+- clean paragraph formatting
+
+Example pattern:
+- heading with source filename
+- section list with `[hh:mm:ss]` markers
+- normalized punctuation and paragraph grouping
+
+## Example Markdown Transcript
+```markdown
+# Lecture Transcript: Intro_to_Bayesian_Stats.mp4
+
+- Source: inbox/mp4/Intro_to_Bayesian_Stats.mp4
+- Processed: 2026-06-04T14:23:11Z
+- Duration: 00:48:12
+- Model: faster-whisper (medium)
+
+## Transcript
+
+[00:00:03] Welcome everyone. Today we are introducing Bayesian thinking and why prior beliefs matter.
+
+[00:02:41] Let us compare frequentist and Bayesian interpretations using a simple coin toss example.
+
+[00:11:09] The posterior combines prior belief and observed evidence in a mathematically explicit way.
+```
+
+## Roadmap
+Near-term:
+- robust `init/scan/process/status/retry/reprocess/clean` command implementation
+- stable SQLite schema with migrations
+- better failure diagnostics and retry policies
+
+Planned future commands:
+- `scribeflow watch`
+- `scribeflow summarize`
+- `scribeflow quiz`
+- `scribeflow terms`
+- `scribeflow index`
+- `scribeflow search`
+
+Long-term:
+- additional STT backends (`whisper.cpp`, hosted APIs)
+- semantic indexing and retrieval support
+- plugin architecture for custom post-processing
+
+## Development Setup
+```bash
+git clone https://github.com/The-QAI-Lab/ScribeFlow.git
+cd ScribeFlow
+python3.11 -m venv .venv
+source .venv/bin/activate
+pip install -e ".[dev]"
+```
+
+## Testing
+```bash
+pytest -q
+```
+
+## Contributing
+Contributions are welcome.
+
+Suggested flow:
+1. Fork and create a feature branch
+2. Add tests for behavior changes
+3. Run `pytest`
+4. Open a PR with clear context and examples
+
+Please keep changes focused, documented, and reproducible.
+
+## License Placeholder
+ScribeFlow is currently released under the MIT License (see `/LICENSE`).
+
+If licensing strategy changes before 1.0, this section will be updated with migration guidance.
+
+## Disclaimer
+Transcription quality depends on audio quality, speaker clarity, domain vocabulary, and model selection.
+
+ScribeFlow may produce errors and should be reviewed before use in academic, legal, medical, or business-critical contexts.
diff --git a/archive/completed/.gitkeep b/archive/completed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/archive/failed/.gitkeep b/archive/failed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/config/scribeflow.example.toml b/config/scribeflow.example.toml
new file mode 100644
index 0000000..e2a595f
--- /dev/null
+++ b/config/scribeflow.example.toml
@@ -0,0 +1,22 @@
+# ScribeFlow example config (placeholder)
+
+[paths]
+inbox_mp4 = "inbox/mp4"
+inbox_mp3 = "inbox/mp3"
+output_markdown = "output/markdown"
+output_raw_json = "output/raw_json"
+output_subtitles = "output/subtitles"
+archive_completed = "archive/completed"
+
+[transcription]
+backend = "faster-whisper"
+model = "base"
+language = "auto"
+
+[processing]
+write_subtitles = true
+auto_archive_completed = false
+max_retries = 3
+
+[ledger]
+path = ".scribeflow/ledger.db"
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 0000000..3e93d83
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,3 @@
+# ScribeFlow Architecture (Placeholder)
+
+This document will describe module boundaries, processing flow, and extension points.
diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md
new file mode 100644
index 0000000..9f63172
--- /dev/null
+++ b/docs/CONFIGURATION.md
@@ -0,0 +1,3 @@
+# ScribeFlow Configuration (Placeholder)
+
+This document will define supported keys, defaults, and environment overrides.
diff --git a/inbox/mp3/.gitkeep b/inbox/mp3/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/inbox/mp4/.gitkeep b/inbox/mp4/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/logs/.gitkeep b/logs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/output/markdown/.gitkeep b/output/markdown/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/output/raw_json/.gitkeep b/output/raw_json/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/output/subtitles/.gitkeep b/output/subtitles/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..9d49f3d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,40 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "scribeflow"
+version = "0.1.0"
+description = "Local-first CLI for converting MP4/MP3 lectures into timestamped Markdown transcripts."
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+authors = [{ name = "The QAI Lab" }]
+dependencies = [
+  "typer>=0.12",
+  "rich>=13.0",
+  "pydantic>=2.0",
+  "tomli>=2.0; python_version < '3.11'",
+  "PyYAML>=6.0",
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.0",
+]
+stt = [
+  "faster-whisper>=1.0.0",
+]
+
+[project.scripts]
+scribeflow = "scribeflow.cli:app"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh
new file mode 100755
index 0000000..933c55b
--- /dev/null
+++ b/scripts/bootstrap.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+python3.11 -m venv .venv
+source .venv/bin/activate
+python -m pip install --upgrade pip
+pip install -e .[dev]
diff --git a/src/scribeflow/__init__.py b/src/scribeflow/__init__.py
new file mode 100644
index 0000000..e2c9d7d
--- /dev/null
+++ b/src/scribeflow/__init__.py
@@ -0,0 +1,4 @@
+"""ScribeFlow package."""
+
+__all__ = ["__version__"]
+__version__ = "0.1.0"
diff --git a/src/scribeflow/__main__.py b/src/scribeflow/__main__.py
new file mode 100644
index 0000000..51fc861
--- /dev/null
+++ b/src/scribeflow/__main__.py
@@ -0,0 +1,7 @@
+"""Module execution entrypoint."""
+
+from scribeflow.cli import app
+
+
+if __name__ == "__main__":
+    app()
diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py
new file mode 100644
index 0000000..6b4ac24
--- /dev/null
+++ b/src/scribeflow/cli.py
@@ -0,0 +1,11 @@
+"""ScribeFlow CLI entrypoint placeholders."""
+
+import typer
+
+app = typer.Typer(help="ScribeFlow CLI")
+
+
+@app.command()
+def version() -> None:
+    """Print version placeholder."""
+    typer.echo("ScribeFlow 0.1.0")
diff --git a/src/scribeflow/commands/__init__.py b/src/scribeflow/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/clean.py b/src/scribeflow/commands/clean.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/init.py b/src/scribeflow/commands/init.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/process.py b/src/scribeflow/commands/process.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/reprocess.py b/src/scribeflow/commands/reprocess.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/retry.py b/src/scribeflow/commands/retry.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/scan.py b/src/scribeflow/commands/scan.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/commands/status.py b/src/scribeflow/commands/status.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/core/__init__.py b/src/scribeflow/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/core/config.py b/src/scribeflow/core/config.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/core/ledger.py b/src/scribeflow/core/ledger.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/core/models.py b/src/scribeflow/core/models.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/pipeline/__init__.py b/src/scribeflow/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/pipeline/formatter.py b/src/scribeflow/pipeline/formatter.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/pipeline/media.py b/src/scribeflow/pipeline/media.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/pipeline/scanner.py b/src/scribeflow/pipeline/scanner.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scribeflow/pipeline/transcriber.py b/src/scribeflow/pipeline/transcriber.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
new file mode 100644
index 0000000..bb118ca
--- /dev/null
+++ b/tests/test_smoke.py
@@ -0,0 +1,7 @@
+"""Basic smoke test placeholder."""
+
+from scribeflow import __version__
+
+
+def test_version_present() -> None:
+    assert __version__

From c02bfd1fe583e803a294ee1391d02e8723374a65 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 19:57:56 +0000
Subject: [PATCH 2/6] Ignore generated packaging metadata artifacts

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index fabf5bb..ea74cd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,6 @@ logs/*
 .DS_Store
 .vscode/
 .idea/
+
+# Packaging artifacts
+*.egg-info/

From dc067e84d39a1c488e43d445b3099e30befbe833 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 19:58:44 +0000
Subject: [PATCH 3/6] Address review feedback for dependency and version
 handling

---
 pyproject.toml        | 1 -
 src/scribeflow/cli.py | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9d49f3d..e9768f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
   "typer>=0.12",
   "rich>=13.0",
   "pydantic>=2.0",
-  "tomli>=2.0; python_version < '3.11'",
   "PyYAML>=6.0",
 ]
 
diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py
index 6b4ac24..442e741 100644
--- a/src/scribeflow/cli.py
+++ b/src/scribeflow/cli.py
@@ -2,10 +2,12 @@
 
 import typer
 
+from scribeflow import __version__
+
 app = typer.Typer(help="ScribeFlow CLI")
 
 
 @app.command()
 def version() -> None:
     """Print version placeholder."""
-    typer.echo("ScribeFlow 0.1.0")
+    typer.echo(f"ScribeFlow {__version__}")

From e225b81ecc4a52cda557c3c6af5c0d041453d71c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:06:16 +0000
Subject: [PATCH 4/6] Implement phase 1 CLI ingestion and ledger foundation

---
 README.md                 | 131 ++++++++++++++----------------
 docs/cli.md               |  93 +++++++++++++++++++++
 src/scribeflow/cli.py     |  82 ++++++++++++++++++-
 src/scribeflow/config.py  |  28 +++++++
 src/scribeflow/hashing.py |  18 +++++
 src/scribeflow/ledger.py  | 165 ++++++++++++++++++++++++++++++++++++++
 src/scribeflow/scanner.py |  63 +++++++++++++++
 src/scribeflow/status.py  |  35 ++++++++
 src/scribeflow/utils.py   |  20 +++++
 tests/test_phase1.py      |  98 ++++++++++++++++++++++
 10 files changed, 659 insertions(+), 74 deletions(-)
 create mode 100644 docs/cli.md
 create mode 100644 src/scribeflow/config.py
 create mode 100644 src/scribeflow/hashing.py
 create mode 100644 src/scribeflow/ledger.py
 create mode 100644 src/scribeflow/scanner.py
 create mode 100644 src/scribeflow/status.py
 create mode 100644 src/scribeflow/utils.py
 create mode 100644 tests/test_phase1.py

diff --git a/README.md b/README.md
index f40ed18..5bd1135 100644
--- a/README.md
+++ b/README.md
@@ -16,36 +16,22 @@ Most transcription workflows are fragmented: one tool for conversion, another fo
 
 ScribeFlow solves this by combining ingestion, normalization, transcription, and Markdown formatting in one CLI workflow with a local SQLite ledger to prevent duplicate work.
 
-## Core Features
-- Local-first processing pipeline for MP4 and MP3 lecture files
+## Core Features (Phase 1)
+- Local-first ingestion pipeline for MP4 and MP3 files
 - Inbox-based workflow (`inbox/mp4/` and `inbox/mp3/`)
-- File hashing and deduplication
-- SQLite processing ledger for status tracking
-- FFmpeg-based extraction and normalization
-- `faster-whisper` default speech-to-text backend
-- Raw transcript JSON output for downstream tooling
-- Optional subtitle output (SRT/VTT)
-- Clean timestamped Markdown transcript generation
-- Retry and reprocess support for failed or updated files
-- Rich terminal UX for status and progress reporting
-
-## Example Workflow
-1. Add lecture/video files to `inbox/mp4/`
-2. Add audio files to `inbox/mp3/`
+- SHA-256 file hashing with chunked reads for large files
+- Duplicate prevention by content hash (not filename)
+- SQLite ledger for file tracking and status counts
+- Idempotent workspace initialization (`scribeflow init`)
+- Rich terminal summaries for scan and status commands
+
+## Example Workflow (Current)
+1. Run `scribeflow init` to create workspace folders and ledger
+2. Add media files to `inbox/mp4/` and/or `inbox/mp3/`
 3. Run `scribeflow scan`
-4. ScribeFlow finds new files and computes content hashes
-5. The SQLite ledger is checked for duplicates and prior status
-6. New items are marked `pending`
-7. Run `scribeflow process` to process all pending items
-8. MP4 files are converted to audio via FFmpeg
-9. MP3 files are normalized to WAV for stable transcription
-10. Speech-to-text runs with `faster-whisper`
-11. Raw transcript JSON is written to `output/raw_json/`
-12. Optional subtitle files are written to `output/subtitles/`
-13. Timestamped Markdown is generated in `output/markdown/`
-14. Ledger status updates to `completed`
-15. (Optional) Original files are moved to `archive/completed/`
-16. Failed jobs are marked `failed` and can be retried with `scribeflow retry`
+4. ScribeFlow scans inbox files, hashes content, and skips duplicate hashes
+5. New files are registered in SQLite with status `pending`
+6. Run `scribeflow status` to see totals and pending queue
 
 ## Folder Structure
 ```text
@@ -111,21 +97,23 @@ pip install -e .[dev]
 
 ## Basic Usage
 ```bash
+scribeflow version
 scribeflow init
 scribeflow scan
-scribeflow process
 scribeflow status
 ```
 
 ## CLI Commands
-- `scribeflow init` — initialize folders, config, and ledger
-- `scribeflow scan` — scan inbox folders and register new files as pending
-- `scribeflow status` — show processing counts and recent jobs
-- `scribeflow process` — process all pending files end-to-end
-- `scribeflow retry` — retry failed jobs
-- `scribeflow reprocess --file <filename>` — force reprocess one file
-- `scribeflow clean` — clean temporary artifacts and stale intermediate files
 - `scribeflow version` — print installed version
+- `scribeflow init` — initialize folders and local SQLite ledger
+- `scribeflow scan` — scan inbox folders and register new files as pending
+- `scribeflow status` — show tracked totals and pending files table
+
+Planned later:
+- `scribeflow process`
+- `scribeflow retry`
+- `scribeflow reprocess --file <filename>`
+- `scribeflow clean`
 
 ## Configuration
 Configuration is expected to be file-based (TOML/YAML support planned; TOML shown by default).
@@ -151,46 +139,45 @@ Recommended ledger responsibilities:
 - record output artifact locations
 - avoid duplicate processing by hash match
 
-Recommended location: `.scribeflow/ledger.db`
+Current location: `.scribeflow/ledger.sqlite`
 
 ## File Status Lifecycle
-Typical lifecycle for each media file:
-1. `discovered`
-2. `pending`
-3. `processing`
-4. `completed` or `failed`
-5. `retrying` (when `scribeflow retry` runs)
-6. back to `processing`, then terminal state
-
-## Markdown Output Format
-Each transcript should be human-readable and machine-parseable:
-- title and source metadata
-- processing timestamp
-- optional model/config metadata
-- timestamped sections/segments
-- clean paragraph formatting
-
-Example pattern:
-- heading with source filename
-- section list with `[hh:mm:ss]` markers
-- normalized punctuation and paragraph grouping
+Current statuses implemented in Phase 1:
+1. `pending`
+2. `completed`
+3. `failed`
 
-## Example Markdown Transcript
-```markdown
-# Lecture Transcript: Intro_to_Bayesian_Stats.mp4
+Phase 1 behavior registers new files as `pending` and reports counts by status.
 
-- Source: inbox/mp4/Intro_to_Bayesian_Stats.mp4
-- Processed: 2026-06-04T14:23:11Z
-- Duration: 00:48:12
-- Model: faster-whisper (medium)
-
-## Transcript
-
-[00:00:03] Welcome everyone. Today we are introducing Bayesian thinking and why prior beliefs matter.
+## Markdown Output Format
+Markdown transcript rendering is planned for a later phase.
+The output folders already exist (`output/markdown`, `output/raw_json`, `output/subtitles`) but are not written to in Phase 1.
 
-[00:02:41] Let us compare frequentist and Bayesian interpretations using a simple coin toss example.
+## Example Command Output
+```text
+$ scribeflow scan
+            Scan Summary            
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
+┃ Metric                     ┃ Count ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
+│ Files scanned              │     3 │
+│ New files registered       │     2 │
+│ Duplicates skipped         │     1 │
+│ Unsupported files ignored  │     0 │
+└────────────────────────────┴───────┘
+```
 
-[00:11:09] The posterior combines prior belief and observed evidence in a mathematically explicit way.
+```text
+$ scribeflow status
+         Ledger Status         
+┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
+┃ Metric                ┃ Count ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
+│ Total files tracked   │     2 │
+│ Pending               │     2 │
+│ Completed             │     0 │
+│ Failed                │     0 │
+└───────────────────────┴───────┘
 ```
 
 ## Roadmap
@@ -200,6 +187,10 @@ Near-term:
 - better failure diagnostics and retry policies
 
 Planned future commands:
+- `scribeflow process`
+- `scribeflow retry`
+- `scribeflow reprocess --file <filename>`
+- `scribeflow clean`
 - `scribeflow watch`
 - `scribeflow summarize`
 - `scribeflow quiz`
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 0000000..6e13e8d
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,93 @@
+# ScribeFlow CLI (Phase 1)
+
+This page documents the currently implemented commands:
+
+- `scribeflow version`
+- `scribeflow init`
+- `scribeflow scan`
+- `scribeflow status`
+
+## `scribeflow version`
+Print the installed version.
+
+```bash
+scribeflow version
+```
+
+Example output:
+```text
+ScribeFlow 0.1.0
+```
+
+## `scribeflow init`
+Creates required workspace folders and initializes the local SQLite ledger.
+
+Creates directories if missing:
+- `inbox/mp4/`
+- `inbox/mp3/`
+- `working/audio/`
+- `working/temp/`
+- `working/logs/`
+- `output/markdown/`
+- `output/raw_json/`
+- `output/subtitles/`
+- `archive/completed/`
+- `archive/failed/`
+- `.scribeflow/`
+
+Creates ledger database:
+- `.scribeflow/ledger.sqlite`
+
+Safe to run multiple times.
+
+```bash
+scribeflow init
+```
+
+Example output:
+```text
+Workspace ready.
+Ledger: .scribeflow/ledger.sqlite
+```
+
+## `scribeflow scan`
+Scans `inbox/mp3/` and `inbox/mp4/`, considers only `.mp3` and `.mp4` files, hashes file contents with SHA-256, and registers new files as `pending`.
+
+Duplicate content hashes are skipped even when filenames differ.
+
+```bash
+scribeflow scan
+```
+
+Example output:
+```text
+            Scan Summary            
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
+┃ Metric                     ┃ Count ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
+│ Files scanned              │     3 │
+│ New files registered       │     2 │
+│ Duplicates skipped         │     1 │
+│ Unsupported files ignored  │     0 │
+└────────────────────────────┴───────┘
+```
+
+## `scribeflow status`
+Shows aggregate ledger counts and a pending-files table.
+
+```bash
+scribeflow status
+```
+
+Example output:
+```text
+         Ledger Status         
+┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
+┃ Metric                ┃ Count ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
+│ Total files tracked   │     2 │
+│ Pending               │     2 │
+│ Completed             │     0 │
+│ Failed                │     0 │
+└───────────────────────┴───────┘
+```
diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py
index 442e741..134eed5 100644
--- a/src/scribeflow/cli.py
+++ b/src/scribeflow/cli.py
@@ -1,13 +1,87 @@
-"""ScribeFlow CLI entrypoint placeholders."""
+"""ScribeFlow CLI."""
+
+from __future__ import annotations
+
+from pathlib import Path
 
 import typer
+from rich.console import Console
+from rich.table import Table
 
 from scribeflow import __version__
+from scribeflow.config import LEDGER_PATH, REQUIRED_DIRECTORIES
+from scribeflow.ledger import Ledger
+from scribeflow.scanner import scan_workspace
+from scribeflow.status import load_status
+from scribeflow.utils import ensure_directories
 
-app = typer.Typer(help="ScribeFlow CLI")
+app = typer.Typer(help="Local-first CLI for MP3/MP4 transcript tracking.")
+console = Console()
 
 
 @app.command()
 def version() -> None:
-    """Print version placeholder."""
-    typer.echo(f"ScribeFlow {__version__}")
+    """Print installed ScribeFlow version."""
+    console.print(f"ScribeFlow {__version__}")
+
+
+@app.command()
+def init() -> None:
+    """Initialize workspace directories and local ledger database."""
+    root = Path(".")
+    ensure_directories(root, REQUIRED_DIRECTORIES)
+
+    ledger = Ledger(root / LEDGER_PATH)
+    ledger.initialize()
+
+    console.print("[green]Workspace ready.[/green]")
+    console.print(f"Ledger: {(root / LEDGER_PATH).as_posix()}")
+
+
+@app.command()
+def scan() -> None:
+    """Scan inbox folders and register new files."""
+    summary = scan_workspace(Path("."))
+
+    table = Table(title="Scan Summary")
+    table.add_column("Metric")
+    table.add_column("Count", justify="right")
+    table.add_row("Files scanned", str(summary.files_scanned))
+    table.add_row("New files registered", str(summary.new_files_registered))
+    table.add_row("Duplicates skipped", str(summary.duplicates_skipped))
+    table.add_row("Unsupported files ignored", str(summary.unsupported_files_ignored))
+    console.print(table)
+
+
+@app.command()
+def status() -> None:
+    """Show ledger totals and pending files."""
+    snapshot = load_status(Path("."))
+
+    totals = Table(title="Ledger Status")
+    totals.add_column("Metric")
+    totals.add_column("Count", justify="right")
+    totals.add_row("Total files tracked", str(snapshot.total))
+    totals.add_row("Pending", str(snapshot.pending))
+    totals.add_row("Completed", str(snapshot.completed))
+    totals.add_row("Failed", str(snapshot.failed))
+    console.print(totals)
+
+    pending_table = Table(title="Pending Files")
+    pending_table.add_column("Filename")
+    pending_table.add_column("Type")
+    pending_table.add_column("Size", justify="right")
+    pending_table.add_column("Discovered At")
+
+    if snapshot.pending_files:
+        for row in snapshot.pending_files:
+            pending_table.add_row(
+                str(row["original_filename"]),
+                str(row["file_type"]),
+                str(row["file_size"]),
+                str(row["discovered_at"]),
+            )
+    else:
+        pending_table.add_row("-", "-", "-", "-")
+
+    console.print(pending_table)
diff --git a/src/scribeflow/config.py b/src/scribeflow/config.py
new file mode 100644
index 0000000..0337c05
--- /dev/null
+++ b/src/scribeflow/config.py
@@ -0,0 +1,28 @@
+"""Configuration constants for local workspace layout."""
+
+from pathlib import Path
+
+WORKSPACE_ROOT = Path(".")
+
+REQUIRED_DIRECTORIES = [
+    Path("inbox/mp4"),
+    Path("inbox/mp3"),
+    Path("working/audio"),
+    Path("working/temp"),
+    Path("working/logs"),
+    Path("output/markdown"),
+    Path("output/raw_json"),
+    Path("output/subtitles"),
+    Path("archive/completed"),
+    Path("archive/failed"),
+    Path(".scribeflow"),
+]
+
+INBOX_DIRECTORIES = [
+    Path("inbox/mp4"),
+    Path("inbox/mp3"),
+]
+
+LEDGER_PATH = Path(".scribeflow/ledger.sqlite")
+
+SUPPORTED_EXTENSIONS = {".mp3", ".mp4"}
diff --git a/src/scribeflow/hashing.py b/src/scribeflow/hashing.py
new file mode 100644
index 0000000..fe3fc54
--- /dev/null
+++ b/src/scribeflow/hashing.py
@@ -0,0 +1,18 @@
+"""File hashing utilities."""
+
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+
+DEFAULT_CHUNK_SIZE = 1024 * 1024
+
+
+def sha256_file(path: Path, chunk_size: int = DEFAULT_CHUNK_SIZE) -> str:
+    """Compute the SHA-256 hash for a file using chunked reads."""
+    digest = hashlib.sha256()
+    with path.open("rb") as file_handle:
+        while chunk := file_handle.read(chunk_size):
+            digest.update(chunk)
+    return digest.hexdigest()
diff --git a/src/scribeflow/ledger.py b/src/scribeflow/ledger.py
new file mode 100644
index 0000000..784124a
--- /dev/null
+++ b/src/scribeflow/ledger.py
@@ -0,0 +1,165 @@
+"""SQLite ledger access layer."""
+
+from __future__ import annotations
+
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+
+
+ALLOWED_STATUSES = ("pending", "completed", "failed")
+
+
+@dataclass(slots=True)
+class LedgerEntry:
+    source_path: str
+    original_filename: str
+    normalized_filename: str
+    file_type: str
+    file_size: int
+    file_hash: str
+    status: str = "pending"
+    discovered_at: str = ""
+    started_at: str | None = None
+    completed_at: str | None = None
+    output_markdown_path: str | None = None
+    output_json_path: str | None = None
+    output_subtitle_path: str | None = None
+    error_message: str | None = None
+    retry_count: int = 0
+
+
+class Ledger:
+    """Provides read/write operations for the local ledger database."""
+
+    def __init__(self, db_path: Path) -> None:
+        self.db_path = db_path
+
+    def initialize(self) -> None:
+        """Create the ledger table and indexes if missing."""
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        with sqlite3.connect(self.db_path) as connection:
+            connection.execute(
+                """
+                CREATE TABLE IF NOT EXISTS ledger (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    source_path TEXT NOT NULL,
+                    original_filename TEXT NOT NULL,
+                    normalized_filename TEXT NOT NULL,
+                    file_type TEXT NOT NULL,
+                    file_size INTEGER NOT NULL,
+                    file_hash TEXT NOT NULL UNIQUE,
+                    status TEXT NOT NULL CHECK (status IN ('pending', 'completed', 'failed')),
+                    discovered_at TEXT NOT NULL,
+                    started_at TEXT,
+                    completed_at TEXT,
+                    output_markdown_path TEXT,
+                    output_json_path TEXT,
+                    output_subtitle_path TEXT,
+                    error_message TEXT,
+                    retry_count INTEGER NOT NULL DEFAULT 0
+                )
+                """
+            )
+            connection.execute(
+                "CREATE INDEX IF NOT EXISTS idx_ledger_status ON ledger(status)"
+            )
+
+    def register_pending(self, entry: LedgerEntry) -> bool:
+        """Insert a pending entry if hash is new; return True if inserted."""
+        if entry.status not in ALLOWED_STATUSES:
+            raise ValueError(f"Unsupported status: {entry.status}")
+
+        with sqlite3.connect(self.db_path) as connection:
+            cursor = connection.execute(
+                """
+                INSERT INTO ledger (
+                    source_path,
+                    original_filename,
+                    normalized_filename,
+                    file_type,
+                    file_size,
+                    file_hash,
+                    status,
+                    discovered_at,
+                    started_at,
+                    completed_at,
+                    output_markdown_path,
+                    output_json_path,
+                    output_subtitle_path,
+                    error_message,
+                    retry_count
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(file_hash) DO NOTHING
+                """,
+                (
+                    entry.source_path,
+                    entry.original_filename,
+                    entry.normalized_filename,
+                    entry.file_type,
+                    entry.file_size,
+                    entry.file_hash,
+                    entry.status,
+                    entry.discovered_at,
+                    entry.started_at,
+                    entry.completed_at,
+                    entry.output_markdown_path,
+                    entry.output_json_path,
+                    entry.output_subtitle_path,
+                    entry.error_message,
+                    entry.retry_count,
+                ),
+            )
+            return cursor.rowcount > 0
+
+    def exists_by_hash(self, file_hash: str) -> bool:
+        """Return True if a file hash is already tracked."""
+        with sqlite3.connect(self.db_path) as connection:
+            row = connection.execute(
+                "SELECT 1 FROM ledger WHERE file_hash = ? LIMIT 1", (file_hash,)
+            ).fetchone()
+            return row is not None
+
+    def counts(self) -> dict[str, int]:
+        """Return total and per-status counts."""
+        with sqlite3.connect(self.db_path) as connection:
+            row = connection.execute(
+                """
+                SELECT
+                    COUNT(*) AS total,
+                    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending,
+                    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed,
+                    SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed
+                FROM ledger
+                """
+            ).fetchone()
+
+        return {
+            "total": int(row[0] or 0),
+            "pending": int(row[1] or 0),
+            "completed": int(row[2] or 0),
+            "failed": int(row[3] or 0),
+        }
+
+    def pending_rows(self) -> list[dict[str, str | int]]:
+        """Return pending files for status reporting."""
+        with sqlite3.connect(self.db_path) as connection:
+            connection.row_factory = sqlite3.Row
+            rows = connection.execute(
+                """
+                SELECT original_filename, file_type, file_size, discovered_at
+                FROM ledger
+                WHERE status = 'pending'
+                ORDER BY discovered_at ASC
+                """
+            ).fetchall()
+
+        return [
+            {
+                "original_filename": row["original_filename"],
+                "file_type": row["file_type"],
+                "file_size": int(row["file_size"]),
+                "discovered_at": row["discovered_at"],
+            }
+            for row in rows
+        ]
diff --git a/src/scribeflow/scanner.py b/src/scribeflow/scanner.py
new file mode 100644
index 0000000..f8237b3
--- /dev/null
+++ b/src/scribeflow/scanner.py
@@ -0,0 +1,63 @@
+"""Inbox scanner implementation."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+from scribeflow.config import INBOX_DIRECTORIES, LEDGER_PATH, SUPPORTED_EXTENSIONS
+from scribeflow.hashing import sha256_file
+from scribeflow.ledger import Ledger, LedgerEntry
+from scribeflow.utils import normalize_filename, to_relative_posix
+
+
+@dataclass(slots=True)
+class ScanSummary:
+    files_scanned: int = 0
+    new_files_registered: int = 0
+    duplicates_skipped: int = 0
+    unsupported_files_ignored: int = 0
+
+
+def scan_workspace(root: Path = Path(".")) -> ScanSummary:
+    """Scan inbox folders and register new media files."""
+    ledger = Ledger(root / LEDGER_PATH)
+    ledger.initialize()
+
+    summary = ScanSummary()
+
+    for inbox in INBOX_DIRECTORIES:
+        inbox_path = root / inbox
+        if not inbox_path.exists():
+            continue
+
+        for file_path in sorted(inbox_path.iterdir()):
+            if not file_path.is_file():
+                continue
+
+            summary.files_scanned += 1
+
+            suffix = file_path.suffix.lower()
+            if suffix not in SUPPORTED_EXTENSIONS:
+                summary.unsupported_files_ignored += 1
+                continue
+
+            file_hash = sha256_file(file_path)
+            entry = LedgerEntry(
+                source_path=to_relative_posix(file_path, root),
+                original_filename=file_path.name,
+                normalized_filename=normalize_filename(file_path.name),
+                file_type=suffix.lstrip("."),
+                file_size=file_path.stat().st_size,
+                file_hash=file_hash,
+                status="pending",
+                discovered_at=datetime.now(UTC).isoformat(),
+            )
+
+            if ledger.register_pending(entry):
+                summary.new_files_registered += 1
+            else:
+                summary.duplicates_skipped += 1
+
+    return summary
diff --git a/src/scribeflow/status.py b/src/scribeflow/status.py
new file mode 100644
index 0000000..02ef6f7
--- /dev/null
+++ b/src/scribeflow/status.py
@@ -0,0 +1,35 @@
+"""Status query helpers."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from scribeflow.config import LEDGER_PATH
+from scribeflow.ledger import Ledger
+
+
+@dataclass(slots=True)
+class StatusSnapshot:
+    total: int
+    pending: int
+    completed: int
+    failed: int
+    pending_files: list[dict[str, str | int]]
+
+
+def load_status(root: Path = Path(".")) -> StatusSnapshot:
+    """Load aggregate status and pending file details from ledger."""
+    ledger = Ledger(root / LEDGER_PATH)
+    ledger.initialize()
+
+    counts = ledger.counts()
+    pending_files = ledger.pending_rows()
+
+    return StatusSnapshot(
+        total=counts["total"],
+        pending=counts["pending"],
+        completed=counts["completed"],
+        failed=counts["failed"],
+        pending_files=pending_files,
+    )
diff --git a/src/scribeflow/utils.py b/src/scribeflow/utils.py
new file mode 100644
index 0000000..3434a13
--- /dev/null
+++ b/src/scribeflow/utils.py
@@ -0,0 +1,20 @@
+"""General utility helpers for ScribeFlow."""
+
+from pathlib import Path
+
+
+def ensure_directories(root: Path, directories: list[Path]) -> None:
+    """Create required directories if they do not exist."""
+    for relative_dir in directories:
+        (root / relative_dir).mkdir(parents=True, exist_ok=True)
+
+
+def normalize_filename(filename: str) -> str:
+    """Normalize a filename for stable ledger storage."""
+    name = Path(filename).name.strip().lower().replace(" ", "_")
+    return name
+
+
+def to_relative_posix(path: Path, root: Path) -> str:
+    """Return a workspace-relative POSIX path string."""
+    return path.resolve().relative_to(root.resolve()).as_posix()
diff --git a/tests/test_phase1.py b/tests/test_phase1.py
new file mode 100644
index 0000000..0dec3e0
--- /dev/null
+++ b/tests/test_phase1.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from scribeflow.cli import app
+from scribeflow.config import LEDGER_PATH, REQUIRED_DIRECTORIES
+from scribeflow.hashing import sha256_file
+from scribeflow.ledger import Ledger
+from scribeflow.status import load_status
+
+
+runner = CliRunner()
+
+
+def write_file(path: Path, data: bytes) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_bytes(data)
+
+
+def test_init_creates_required_folders_and_ledger(tmp_path: Path, monkeypatch) -> None:
+    monkeypatch.chdir(tmp_path)
+
+    result = runner.invoke(app, ["init"])
+
+    assert result.exit_code == 0
+    for folder in REQUIRED_DIRECTORIES:
+        assert (tmp_path / folder).is_dir()
+    assert (tmp_path / LEDGER_PATH).is_file()
+
+
+def test_hashing_is_consistent_sha256(tmp_path: Path) -> None:
+    file_path = tmp_path / "sample.mp3"
+    payload = b"consistent-bytes-for-hash"
+    file_path.write_bytes(payload)
+
+    expected = hashlib.sha256(payload).hexdigest()
+
+    assert sha256_file(file_path) == expected
+    assert sha256_file(file_path) == expected
+
+
+def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None:
+    monkeypatch.chdir(tmp_path)
+    runner.invoke(app, ["init"])
+
+    write_file(tmp_path / "inbox/mp3/lecture-a.mp3", b"mp3-content-a")
+    write_file(tmp_path / "inbox/mp4/lecture-b.mp4", b"mp4-content-b")
+
+    result = runner.invoke(app, ["scan"])
+
+    assert result.exit_code == 0
+    assert "Files scanned" in result.output
+    assert "New files registered" in result.output
+
+    ledger = Ledger(tmp_path / LEDGER_PATH)
+    counts = ledger.counts()
+    assert counts["total"] == 2
+    assert counts["pending"] == 2
+
+
+def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None:
+    monkeypatch.chdir(tmp_path)
+    runner.invoke(app, ["init"])
+
+    duplicate_payload = b"same-content-different-name"
+    write_file(tmp_path / "inbox/mp3/first.mp3", duplicate_payload)
+    write_file(tmp_path / "inbox/mp4/second.mp4", duplicate_payload)
+
+    first_scan = runner.invoke(app, ["scan"])
+    second_scan = runner.invoke(app, ["scan"])
+
+    assert first_scan.exit_code == 0
+    assert second_scan.exit_code == 0
+
+    ledger = Ledger(tmp_path / LEDGER_PATH)
+    counts = ledger.counts()
+    assert counts["total"] == 1
+    assert counts["pending"] == 1
+
+
+def test_status_reads_ledger_counts(tmp_path: Path, monkeypatch) -> None:
+    monkeypatch.chdir(tmp_path)
+    runner.invoke(app, ["init"])
+
+    write_file(tmp_path / "inbox/mp3/pending.mp3", b"pending-bytes")
+    runner.invoke(app, ["scan"])
+
+    status = load_status(tmp_path)
+
+    assert status.total == 1
+    assert status.pending == 1
+    assert status.completed == 0
+    assert status.failed == 0
+    assert len(status.pending_files) == 1
+    assert status.pending_files[0]["original_filename"] == "pending.mp3"

From f39a614fcd68b0917af5f3eb7bc4277cef47861a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:07:22 +0000
Subject: [PATCH 5/6] Align docs and config with implemented ledger statuses

---
 README.md                      | 2 +-
 config/scribeflow.example.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5bd1135..b6d2601 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,7 @@ ScribeFlow keeps a local SQLite database to persist processing state.
 
 Recommended ledger responsibilities:
 - track canonical file path and content hash
-- track status transitions (`discovered` -> `pending` -> `processing` -> `completed`/`failed`)
+- track the current status of each file (`pending`, `completed`, or `failed` in Phase 1)
 - store attempt count and timestamps
 - record output artifact locations
 - avoid duplicate processing by hash match
diff --git a/config/scribeflow.example.toml b/config/scribeflow.example.toml
index e2a595f..90de4a9 100644
--- a/config/scribeflow.example.toml
+++ b/config/scribeflow.example.toml
@@ -19,4 +19,4 @@ auto_archive_completed = false
 max_retries = 3
 
 [ledger]
-path = ".scribeflow/ledger.db"
+path = ".scribeflow/ledger.sqlite"

From 18f322ea6db8c5b54a9c50ba507d41b3fce6f375 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:08:02 +0000
Subject: [PATCH 6/6] Add pytest MonkeyPatch annotations in phase 1 tests

---
 tests/test_phase1.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/test_phase1.py b/tests/test_phase1.py
index 0dec3e0..d10c894 100644
--- a/tests/test_phase1.py
+++ b/tests/test_phase1.py
@@ -3,6 +3,7 @@
 import hashlib
 from pathlib import Path
 
+import pytest
 from typer.testing import CliRunner
 
 from scribeflow.cli import app
@@ -20,7 +21,9 @@ def write_file(path: Path, data: bytes) -> None:
     path.write_bytes(data)
 
 
-def test_init_creates_required_folders_and_ledger(tmp_path: Path, monkeypatch) -> None:
+def test_init_creates_required_folders_and_ledger(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
     monkeypatch.chdir(tmp_path)
 
     result = runner.invoke(app, ["init"])
@@ -42,7 +45,9 @@ def test_hashing_is_consistent_sha256(tmp_path: Path) -> None:
     assert sha256_file(file_path) == expected
 
 
-def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None:
+def test_scan_registers_new_mp3_and_mp4(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
     monkeypatch.chdir(tmp_path)
     runner.invoke(app, ["init"])
 
@@ -61,7 +66,9 @@ def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None:
     assert counts["pending"] == 2
 
 
-def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None:
+def test_scan_skips_duplicates_by_hash(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
     monkeypatch.chdir(tmp_path)
     runner.invoke(app, ["init"])
 
@@ -81,7 +88,9 @@ def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None:
     assert counts["pending"] == 1
 
 
-def test_status_reads_ledger_counts(tmp_path: Path, monkeypatch) -> None:
+def test_status_reads_ledger_counts(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
     monkeypatch.chdir(tmp_path)
     runner.invoke(app, ["init"])