From f0579a7743fc5f38a8643ca3357bd713e30d832c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:57:11 +0000 Subject: [PATCH 1/6] Scaffold initial ScribeFlow project structure and docs --- .gitignore | 27 +++ .scribeflow/.gitkeep | 0 README.md | 248 ++++++++++++++++++++++++- archive/completed/.gitkeep | 0 archive/failed/.gitkeep | 0 config/scribeflow.example.toml | 22 +++ docs/ARCHITECTURE.md | 3 + docs/CONFIGURATION.md | 3 + inbox/mp3/.gitkeep | 0 inbox/mp4/.gitkeep | 0 logs/.gitkeep | 0 output/markdown/.gitkeep | 0 output/raw_json/.gitkeep | 0 output/subtitles/.gitkeep | 0 pyproject.toml | 40 ++++ scripts/bootstrap.sh | 7 + src/scribeflow/__init__.py | 4 + src/scribeflow/__main__.py | 7 + src/scribeflow/cli.py | 11 ++ src/scribeflow/commands/__init__.py | 0 src/scribeflow/commands/clean.py | 0 src/scribeflow/commands/init.py | 0 src/scribeflow/commands/process.py | 0 src/scribeflow/commands/reprocess.py | 0 src/scribeflow/commands/retry.py | 0 src/scribeflow/commands/scan.py | 0 src/scribeflow/commands/status.py | 0 src/scribeflow/core/__init__.py | 0 src/scribeflow/core/config.py | 0 src/scribeflow/core/ledger.py | 0 src/scribeflow/core/models.py | 0 src/scribeflow/pipeline/__init__.py | 0 src/scribeflow/pipeline/formatter.py | 0 src/scribeflow/pipeline/media.py | 0 src/scribeflow/pipeline/scanner.py | 0 src/scribeflow/pipeline/transcriber.py | 0 tests/__init__.py | 0 tests/test_smoke.py | 7 + 38 files changed, 378 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 .scribeflow/.gitkeep create mode 100644 archive/completed/.gitkeep create mode 100644 archive/failed/.gitkeep create mode 100644 config/scribeflow.example.toml create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/CONFIGURATION.md create mode 100644 inbox/mp3/.gitkeep create mode 100644 inbox/mp4/.gitkeep create mode 100644 logs/.gitkeep create mode 100644 output/markdown/.gitkeep create mode 100644 output/raw_json/.gitkeep create mode 100644 output/subtitles/.gitkeep create mode 100644 pyproject.toml create mode 100755 scripts/bootstrap.sh create mode 100644 src/scribeflow/__init__.py create mode 100644 src/scribeflow/__main__.py create mode 100644 src/scribeflow/cli.py create mode 100644 src/scribeflow/commands/__init__.py create mode 100644 src/scribeflow/commands/clean.py create mode 100644 src/scribeflow/commands/init.py create mode 100644 src/scribeflow/commands/process.py create mode 100644 src/scribeflow/commands/reprocess.py create mode 100644 src/scribeflow/commands/retry.py create mode 100644 src/scribeflow/commands/scan.py create mode 100644 src/scribeflow/commands/status.py create mode 100644 src/scribeflow/core/__init__.py create mode 100644 src/scribeflow/core/config.py create mode 100644 src/scribeflow/core/ledger.py create mode 100644 src/scribeflow/core/models.py create mode 100644 src/scribeflow/pipeline/__init__.py create mode 100644 src/scribeflow/pipeline/formatter.py create mode 100644 src/scribeflow/pipeline/media.py create mode 100644 src/scribeflow/pipeline/scanner.py create mode 100644 src/scribeflow/pipeline/transcriber.py create mode 100644 tests/__init__.py create mode 100644 tests/test_smoke.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fabf5bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# Python +__pycache__/ +*.py[cod] +*.so +.pytest_cache/ + +# Virtual environments +.venv/ + +# Local ledger/runtime state +.scribeflow/* +!.scribeflow/.gitkeep + +# Local outputs +output/markdown/* +!output/markdown/.gitkeep +output/raw_json/* +!output/raw_json/.gitkeep +output/subtitles/* +!output/subtitles/.gitkeep +logs/* +!logs/.gitkeep + +# OS/editor +.DS_Store +.vscode/ +.idea/ diff --git a/.scribeflow/.gitkeep b/.scribeflow/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index f8a3681..f40ed18 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,248 @@ # ScribeFlow -CLI tool for converting MP4 and MP3 lectures into clean, timestamped Markdown transcripts using local speech-to-text. + +> Local-first CLI for converting MP4/MP3 lectures into timestamped Markdown transcripts. + +## Project Overview +ScribeFlow is an open-source, local-first transcription workflow for people who want structured notes from recorded audio or video. + +It is designed for students, researchers, educators, and builders who need: +- reproducible transcript generation, +- clear file tracking, +- clean Markdown outputs suitable for study, search, and AI/RAG workflows, +- a privacy-friendly workflow that can run fully on local machines. + +## Why ScribeFlow Exists +Most transcription workflows are fragmented: one tool for conversion, another for transcription, another for cleanup, and no reliable ledger to track what was processed. + +ScribeFlow solves this by combining ingestion, normalization, transcription, and Markdown formatting in one CLI workflow with a local SQLite ledger to prevent duplicate work. + +## Core Features +- Local-first processing pipeline for MP4 and MP3 lecture files +- Inbox-based workflow (`inbox/mp4/` and `inbox/mp3/`) +- File hashing and deduplication +- SQLite processing ledger for status tracking +- FFmpeg-based extraction and normalization +- `faster-whisper` default speech-to-text backend +- Raw transcript JSON output for downstream tooling +- Optional subtitle output (SRT/VTT) +- Clean timestamped Markdown transcript generation +- Retry and reprocess support for failed or updated files +- Rich terminal UX for status and progress reporting + +## Example Workflow +1. Add lecture/video files to `inbox/mp4/` +2. Add audio files to `inbox/mp3/` +3. Run `scribeflow scan` +4. ScribeFlow finds new files and computes content hashes +5. The SQLite ledger is checked for duplicates and prior status +6. New items are marked `pending` +7. Run `scribeflow process` to process all pending items +8. MP4 files are converted to audio via FFmpeg +9. MP3 files are normalized to WAV for stable transcription +10. Speech-to-text runs with `faster-whisper` +11. Raw transcript JSON is written to `output/raw_json/` +12. Optional subtitle files are written to `output/subtitles/` +13. Timestamped Markdown is generated in `output/markdown/` +14. Ledger status updates to `completed` +15. (Optional) Original files are moved to `archive/completed/` +16. Failed jobs are marked `failed` and can be retried with `scribeflow retry` + +## Folder Structure +```text +ScribeFlow/ +├── archive/ +│ ├── completed/ +│ └── failed/ +├── config/ +│ └── scribeflow.example.toml +├── docs/ +│ ├── ARCHITECTURE.md +│ └── CONFIGURATION.md +├── inbox/ +│ ├── mp3/ +│ └── mp4/ +├── logs/ +├── output/ +│ ├── markdown/ +│ ├── raw_json/ +│ └── subtitles/ +├── scripts/ +│ └── bootstrap.sh +├── src/ +│ └── scribeflow/ +│ ├── __main__.py +│ ├── cli.py +│ ├── commands/ +│ ├── core/ +│ └── pipeline/ +├── tests/ +├── .scribeflow/ +│ └── (sqlite ledger lives here) +├── pyproject.toml +├── LICENSE +└── README.md +``` + +## Installation Requirements +- Python 3.11+ +- FFmpeg available on PATH +- OS: macOS, Linux, or Windows (WSL recommended on Windows) + +## FFmpeg Requirement +ScribeFlow depends on FFmpeg for media extraction and normalization. + +Check installation: +```bash +ffmpeg -version +``` + +Install examples: +- macOS (Homebrew): `brew install ffmpeg` +- Ubuntu/Debian: `sudo apt-get install ffmpeg` +- Windows (choco): `choco install ffmpeg` + +## Python Setup +```bash +python3.11 -m venv .venv +source .venv/bin/activate +pip install --upgrade pip +pip install -e .[dev] +``` + +## Basic Usage +```bash +scribeflow init +scribeflow scan +scribeflow process +scribeflow status +``` + +## CLI Commands +- `scribeflow init` — initialize folders, config, and ledger +- `scribeflow scan` — scan inbox folders and register new files as pending +- `scribeflow status` — show processing counts and recent jobs +- `scribeflow process` — process all pending files end-to-end +- `scribeflow retry` — retry failed jobs +- `scribeflow reprocess --file ` — force reprocess one file +- `scribeflow clean` — clean temporary artifacts and stale intermediate files +- `scribeflow version` — print installed version + +## Configuration +Configuration is expected to be file-based (TOML/YAML support planned; TOML shown by default). + +Suggested config surface: +- input/output directories +- archive behavior +- transcription model + device settings +- subtitle output toggle (SRT/VTT) +- hashing and duplicate strategy +- retries and failure policy +- logging verbosity + +See `/config/scribeflow.example.toml` and `/docs/CONFIGURATION.md`. + +## How the SQLite Ledger Works +ScribeFlow keeps a local SQLite database to persist processing state. + +Recommended ledger responsibilities: +- track canonical file path and content hash +- track status transitions (`discovered` -> `pending` -> `processing` -> `completed`/`failed`) +- store attempt count and timestamps +- record output artifact locations +- avoid duplicate processing by hash match + +Recommended location: `.scribeflow/ledger.db` + +## File Status Lifecycle +Typical lifecycle for each media file: +1. `discovered` +2. `pending` +3. `processing` +4. `completed` or `failed` +5. `retrying` (when `scribeflow retry` runs) +6. back to `processing`, then terminal state + +## Markdown Output Format +Each transcript should be human-readable and machine-parseable: +- title and source metadata +- processing timestamp +- optional model/config metadata +- timestamped sections/segments +- clean paragraph formatting + +Example pattern: +- heading with source filename +- section list with `[hh:mm:ss]` markers +- normalized punctuation and paragraph grouping + +## Example Markdown Transcript +```markdown +# Lecture Transcript: Intro_to_Bayesian_Stats.mp4 + +- Source: inbox/mp4/Intro_to_Bayesian_Stats.mp4 +- Processed: 2026-06-04T14:23:11Z +- Duration: 00:48:12 +- Model: faster-whisper (medium) + +## Transcript + +[00:00:03] Welcome everyone. Today we are introducing Bayesian thinking and why prior beliefs matter. + +[00:02:41] Let us compare frequentist and Bayesian interpretations using a simple coin toss example. + +[00:11:09] The posterior combines prior belief and observed evidence in a mathematically explicit way. +``` + +## Roadmap +Near-term: +- robust `init/scan/process/status/retry/reprocess/clean` command implementation +- stable SQLite schema with migrations +- better failure diagnostics and retry policies + +Planned future commands: +- `scribeflow watch` +- `scribeflow summarize` +- `scribeflow quiz` +- `scribeflow terms` +- `scribeflow index` +- `scribeflow search` + +Long-term: +- additional STT backends (`whisper.cpp`, hosted APIs) +- semantic indexing and retrieval support +- plugin architecture for custom post-processing + +## Development Setup +```bash +git clone https://github.com/The-QAI-Lab/ScribeFlow.git +cd ScribeFlow +python3.11 -m venv .venv +source .venv/bin/activate +pip install -e .[dev] +``` + +## Testing +```bash +pytest -q +``` + +## Contributing +Contributions are welcome. + +Suggested flow: +1. Fork and create a feature branch +2. Add tests for behavior changes +3. Run `pytest` +4. Open a PR with clear context and examples + +Please keep changes focused, documented, and reproducible. + +## License Placeholder +ScribeFlow is currently released under the MIT License (see `/LICENSE`). + +If licensing strategy changes before 1.0, this section will be updated with migration guidance. + +## Disclaimer +Transcription quality depends on audio quality, speaker clarity, domain vocabulary, and model selection. + +ScribeFlow may produce errors and should be reviewed before use in academic, legal, medical, or business-critical contexts. diff --git a/archive/completed/.gitkeep b/archive/completed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/archive/failed/.gitkeep b/archive/failed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/config/scribeflow.example.toml b/config/scribeflow.example.toml new file mode 100644 index 0000000..e2a595f --- /dev/null +++ b/config/scribeflow.example.toml @@ -0,0 +1,22 @@ +# ScribeFlow example config (placeholder) + +[paths] +inbox_mp4 = "inbox/mp4" +inbox_mp3 = "inbox/mp3" +output_markdown = "output/markdown" +output_raw_json = "output/raw_json" +output_subtitles = "output/subtitles" +archive_completed = "archive/completed" + +[transcription] +backend = "faster-whisper" +model = "base" +language = "auto" + +[processing] +write_subtitles = true +auto_archive_completed = false +max_retries = 3 + +[ledger] +path = ".scribeflow/ledger.db" diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..3e93d83 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,3 @@ +# ScribeFlow Architecture (Placeholder) + +This document will describe module boundaries, processing flow, and extension points. diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md new file mode 100644 index 0000000..9f63172 --- /dev/null +++ b/docs/CONFIGURATION.md @@ -0,0 +1,3 @@ +# ScribeFlow Configuration (Placeholder) + +This document will define supported keys, defaults, and environment overrides. diff --git a/inbox/mp3/.gitkeep b/inbox/mp3/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/inbox/mp4/.gitkeep b/inbox/mp4/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/output/markdown/.gitkeep b/output/markdown/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/output/raw_json/.gitkeep b/output/raw_json/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/output/subtitles/.gitkeep b/output/subtitles/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9d49f3d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "scribeflow" +version = "0.1.0" +description = "Local-first CLI for converting MP4/MP3 lectures into timestamped Markdown transcripts." +readme = "README.md" +requires-python = ">=3.11" +license = { text = "MIT" } +authors = [{ name = "The QAI Lab" }] +dependencies = [ + "typer>=0.12", + "rich>=13.0", + "pydantic>=2.0", + "tomli>=2.0; python_version < '3.11'", + "PyYAML>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0", +] +stt = [ + "faster-whisper>=1.0.0", +] + +[project.scripts] +scribeflow = "scribeflow.cli:app" + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-q" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh new file mode 100755 index 0000000..933c55b --- /dev/null +++ b/scripts/bootstrap.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +python3.11 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip +pip install -e .[dev] diff --git a/src/scribeflow/__init__.py b/src/scribeflow/__init__.py new file mode 100644 index 0000000..e2c9d7d --- /dev/null +++ b/src/scribeflow/__init__.py @@ -0,0 +1,4 @@ +"""ScribeFlow package.""" + +__all__ = ["__version__"] +__version__ = "0.1.0" diff --git a/src/scribeflow/__main__.py b/src/scribeflow/__main__.py new file mode 100644 index 0000000..51fc861 --- /dev/null +++ b/src/scribeflow/__main__.py @@ -0,0 +1,7 @@ +"""Module execution entrypoint.""" + +from scribeflow.cli import app + + +if __name__ == "__main__": + app() diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py new file mode 100644 index 0000000..6b4ac24 --- /dev/null +++ b/src/scribeflow/cli.py @@ -0,0 +1,11 @@ +"""ScribeFlow CLI entrypoint placeholders.""" + +import typer + +app = typer.Typer(help="ScribeFlow CLI") + + +@app.command() +def version() -> None: + """Print version placeholder.""" + typer.echo("ScribeFlow 0.1.0") diff --git a/src/scribeflow/commands/__init__.py b/src/scribeflow/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/clean.py b/src/scribeflow/commands/clean.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/init.py b/src/scribeflow/commands/init.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/process.py b/src/scribeflow/commands/process.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/reprocess.py b/src/scribeflow/commands/reprocess.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/retry.py b/src/scribeflow/commands/retry.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/scan.py b/src/scribeflow/commands/scan.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/commands/status.py b/src/scribeflow/commands/status.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/core/__init__.py b/src/scribeflow/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/core/config.py b/src/scribeflow/core/config.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/core/ledger.py b/src/scribeflow/core/ledger.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/core/models.py b/src/scribeflow/core/models.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/pipeline/__init__.py b/src/scribeflow/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/pipeline/formatter.py b/src/scribeflow/pipeline/formatter.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/pipeline/media.py b/src/scribeflow/pipeline/media.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/pipeline/scanner.py b/src/scribeflow/pipeline/scanner.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scribeflow/pipeline/transcriber.py b/src/scribeflow/pipeline/transcriber.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 index 0000000..bb118ca --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,7 @@ +"""Basic smoke test placeholder.""" + +from scribeflow import __version__ + + +def test_version_present() -> None: + assert __version__ From c02bfd1fe583e803a294ee1391d02e8723374a65 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:57:56 +0000 Subject: [PATCH 2/6] Ignore generated packaging metadata artifacts --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index fabf5bb..ea74cd0 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ logs/* .DS_Store .vscode/ .idea/ + +# Packaging artifacts +*.egg-info/ From dc067e84d39a1c488e43d445b3099e30befbe833 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:58:44 +0000 Subject: [PATCH 3/6] Address review feedback for dependency and version handling --- pyproject.toml | 1 - src/scribeflow/cli.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9d49f3d..e9768f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ "typer>=0.12", "rich>=13.0", "pydantic>=2.0", - "tomli>=2.0; python_version < '3.11'", "PyYAML>=6.0", ] diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py index 6b4ac24..442e741 100644 --- a/src/scribeflow/cli.py +++ b/src/scribeflow/cli.py @@ -2,10 +2,12 @@ import typer +from scribeflow import __version__ + app = typer.Typer(help="ScribeFlow CLI") @app.command() def version() -> None: """Print version placeholder.""" - typer.echo("ScribeFlow 0.1.0") + typer.echo(f"ScribeFlow {__version__}") From e225b81ecc4a52cda557c3c6af5c0d041453d71c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:06:16 +0000 Subject: [PATCH 4/6] Implement phase 1 CLI ingestion and ledger foundation --- README.md | 131 ++++++++++++++---------------- docs/cli.md | 93 +++++++++++++++++++++ src/scribeflow/cli.py | 82 ++++++++++++++++++- src/scribeflow/config.py | 28 +++++++ src/scribeflow/hashing.py | 18 +++++ src/scribeflow/ledger.py | 165 ++++++++++++++++++++++++++++++++++++++ src/scribeflow/scanner.py | 63 +++++++++++++++ src/scribeflow/status.py | 35 ++++++++ src/scribeflow/utils.py | 20 +++++ tests/test_phase1.py | 98 ++++++++++++++++++++++ 10 files changed, 659 insertions(+), 74 deletions(-) create mode 100644 docs/cli.md create mode 100644 src/scribeflow/config.py create mode 100644 src/scribeflow/hashing.py create mode 100644 src/scribeflow/ledger.py create mode 100644 src/scribeflow/scanner.py create mode 100644 src/scribeflow/status.py create mode 100644 src/scribeflow/utils.py create mode 100644 tests/test_phase1.py diff --git a/README.md b/README.md index f40ed18..5bd1135 100644 --- a/README.md +++ b/README.md @@ -16,36 +16,22 @@ Most transcription workflows are fragmented: one tool for conversion, another fo ScribeFlow solves this by combining ingestion, normalization, transcription, and Markdown formatting in one CLI workflow with a local SQLite ledger to prevent duplicate work. -## Core Features -- Local-first processing pipeline for MP4 and MP3 lecture files +## Core Features (Phase 1) +- Local-first ingestion pipeline for MP4 and MP3 files - Inbox-based workflow (`inbox/mp4/` and `inbox/mp3/`) -- File hashing and deduplication -- SQLite processing ledger for status tracking -- FFmpeg-based extraction and normalization -- `faster-whisper` default speech-to-text backend -- Raw transcript JSON output for downstream tooling -- Optional subtitle output (SRT/VTT) -- Clean timestamped Markdown transcript generation -- Retry and reprocess support for failed or updated files -- Rich terminal UX for status and progress reporting - -## Example Workflow -1. Add lecture/video files to `inbox/mp4/` -2. Add audio files to `inbox/mp3/` +- SHA-256 file hashing with chunked reads for large files +- Duplicate prevention by content hash (not filename) +- SQLite ledger for file tracking and status counts +- Idempotent workspace initialization (`scribeflow init`) +- Rich terminal summaries for scan and status commands + +## Example Workflow (Current) +1. Run `scribeflow init` to create workspace folders and ledger +2. Add media files to `inbox/mp4/` and/or `inbox/mp3/` 3. Run `scribeflow scan` -4. ScribeFlow finds new files and computes content hashes -5. The SQLite ledger is checked for duplicates and prior status -6. New items are marked `pending` -7. Run `scribeflow process` to process all pending items -8. MP4 files are converted to audio via FFmpeg -9. MP3 files are normalized to WAV for stable transcription -10. Speech-to-text runs with `faster-whisper` -11. Raw transcript JSON is written to `output/raw_json/` -12. Optional subtitle files are written to `output/subtitles/` -13. Timestamped Markdown is generated in `output/markdown/` -14. Ledger status updates to `completed` -15. (Optional) Original files are moved to `archive/completed/` -16. Failed jobs are marked `failed` and can be retried with `scribeflow retry` +4. ScribeFlow scans inbox files, hashes content, and skips duplicate hashes +5. New files are registered in SQLite with status `pending` +6. Run `scribeflow status` to see totals and pending queue ## Folder Structure ```text @@ -111,21 +97,23 @@ pip install -e .[dev] ## Basic Usage ```bash +scribeflow version scribeflow init scribeflow scan -scribeflow process scribeflow status ``` ## CLI Commands -- `scribeflow init` — initialize folders, config, and ledger -- `scribeflow scan` — scan inbox folders and register new files as pending -- `scribeflow status` — show processing counts and recent jobs -- `scribeflow process` — process all pending files end-to-end -- `scribeflow retry` — retry failed jobs -- `scribeflow reprocess --file ` — force reprocess one file -- `scribeflow clean` — clean temporary artifacts and stale intermediate files - `scribeflow version` — print installed version +- `scribeflow init` — initialize folders and local SQLite ledger +- `scribeflow scan` — scan inbox folders and register new files as pending +- `scribeflow status` — show tracked totals and pending files table + +Planned later: +- `scribeflow process` +- `scribeflow retry` +- `scribeflow reprocess --file ` +- `scribeflow clean` ## Configuration Configuration is expected to be file-based (TOML/YAML support planned; TOML shown by default). @@ -151,46 +139,45 @@ Recommended ledger responsibilities: - record output artifact locations - avoid duplicate processing by hash match -Recommended location: `.scribeflow/ledger.db` +Current location: `.scribeflow/ledger.sqlite` ## File Status Lifecycle -Typical lifecycle for each media file: -1. `discovered` -2. `pending` -3. `processing` -4. `completed` or `failed` -5. `retrying` (when `scribeflow retry` runs) -6. back to `processing`, then terminal state - -## Markdown Output Format -Each transcript should be human-readable and machine-parseable: -- title and source metadata -- processing timestamp -- optional model/config metadata -- timestamped sections/segments -- clean paragraph formatting - -Example pattern: -- heading with source filename -- section list with `[hh:mm:ss]` markers -- normalized punctuation and paragraph grouping +Current statuses implemented in Phase 1: +1. `pending` +2. `completed` +3. `failed` -## Example Markdown Transcript -```markdown -# Lecture Transcript: Intro_to_Bayesian_Stats.mp4 +Phase 1 behavior registers new files as `pending` and reports counts by status. -- Source: inbox/mp4/Intro_to_Bayesian_Stats.mp4 -- Processed: 2026-06-04T14:23:11Z -- Duration: 00:48:12 -- Model: faster-whisper (medium) - -## Transcript - -[00:00:03] Welcome everyone. Today we are introducing Bayesian thinking and why prior beliefs matter. +## Markdown Output Format +Markdown transcript rendering is planned for a later phase. +The output folders already exist (`output/markdown`, `output/raw_json`, `output/subtitles`) but are not written in Phase 1. -[00:02:41] Let us compare frequentist and Bayesian interpretations using a simple coin toss example. +## Example Command Output +```text +$ scribeflow scan + Scan Summary +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ +┃ Metric ┃ Count ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ +│ Files scanned │ 3 │ +│ New files registered │ 2 │ +│ Duplicates skipped │ 1 │ +│ Unsupported files ignored │ 0 │ +└────────────────────────────┴───────┘ +``` -[00:11:09] The posterior combines prior belief and observed evidence in a mathematically explicit way. +```text +$ scribeflow status + Ledger Status +┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ +┃ Metric ┃ Count ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ +│ Total files tracked │ 2 │ +│ Pending │ 2 │ +│ Completed │ 0 │ +│ Failed │ 0 │ +└───────────────────────┴───────┘ ``` ## Roadmap @@ -200,6 +187,10 @@ Near-term: - better failure diagnostics and retry policies Planned future commands: +- `scribeflow process` +- `scribeflow retry` +- `scribeflow reprocess --file ` +- `scribeflow clean` - `scribeflow watch` - `scribeflow summarize` - `scribeflow quiz` diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..6e13e8d --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,93 @@ +# ScribeFlow CLI (Phase 1) + +This page documents the currently implemented commands: + +- `scribeflow version` +- `scribeflow init` +- `scribeflow scan` +- `scribeflow status` + +## `scribeflow version` +Print the installed version. + +```bash +scribeflow version +``` + +Example output: +```text +ScribeFlow 0.1.0 +``` + +## `scribeflow init` +Creates required workspace folders and initializes the local SQLite ledger. + +Creates directories if missing: +- `inbox/mp4/` +- `inbox/mp3/` +- `working/audio/` +- `working/temp/` +- `working/logs/` +- `output/markdown/` +- `output/raw_json/` +- `output/subtitles/` +- `archive/completed/` +- `archive/failed/` +- `.scribeflow/` + +Creates ledger database: +- `.scribeflow/ledger.sqlite` + +Safe to run multiple times. + +```bash +scribeflow init +``` + +Example output: +```text +Workspace ready. +Ledger: .scribeflow/ledger.sqlite +``` + +## `scribeflow scan` +Scans `inbox/mp3/` and `inbox/mp4/`, processes only `.mp3` and `.mp4`, hashes file contents with SHA-256, and registers new files as `pending`. + +Duplicate content hashes are skipped even when filenames differ. + +```bash +scribeflow scan +``` + +Example output: +```text + Scan Summary +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ +┃ Metric ┃ Count ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ +│ Files scanned │ 3 │ +│ New files registered │ 2 │ +│ Duplicates skipped │ 1 │ +│ Unsupported files ignored │ 0 │ +└────────────────────────────┴───────┘ +``` + +## `scribeflow status` +Shows aggregate ledger counts and a pending-files table. + +```bash +scribeflow status +``` + +Example output: +```text + Ledger Status +┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ +┃ Metric ┃ Count ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ +│ Total files tracked │ 2 │ +│ Pending │ 2 │ +│ Completed │ 0 │ +│ Failed │ 0 │ +└───────────────────────┴───────┘ +``` diff --git a/src/scribeflow/cli.py b/src/scribeflow/cli.py index 442e741..134eed5 100644 --- a/src/scribeflow/cli.py +++ b/src/scribeflow/cli.py @@ -1,13 +1,87 @@ -"""ScribeFlow CLI entrypoint placeholders.""" +"""ScribeFlow CLI.""" + +from __future__ import annotations + +from pathlib import Path import typer +from rich.console import Console +from rich.table import Table from scribeflow import __version__ +from scribeflow.config import LEDGER_PATH, REQUIRED_DIRECTORIES +from scribeflow.ledger import Ledger +from scribeflow.scanner import scan_workspace +from scribeflow.status import load_status +from scribeflow.utils import ensure_directories -app = typer.Typer(help="ScribeFlow CLI") +app = typer.Typer(help="Local-first CLI for MP3/MP4 transcript tracking.") +console = Console() @app.command() def version() -> None: - """Print version placeholder.""" - typer.echo(f"ScribeFlow {__version__}") + """Print installed ScribeFlow version.""" + console.print(f"ScribeFlow {__version__}") + + +@app.command() +def init() -> None: + """Initialize workspace directories and local ledger database.""" + root = Path(".") + ensure_directories(root, REQUIRED_DIRECTORIES) + + ledger = Ledger(root / LEDGER_PATH) + ledger.initialize() + + console.print("[green]Workspace ready.[/green]") + console.print(f"Ledger: {(root / LEDGER_PATH).as_posix()}") + + +@app.command() +def scan() -> None: + """Scan inbox folders and register new files.""" + summary = scan_workspace(Path(".")) + + table = Table(title="Scan Summary") + table.add_column("Metric") + table.add_column("Count", justify="right") + table.add_row("Files scanned", str(summary.files_scanned)) + table.add_row("New files registered", str(summary.new_files_registered)) + table.add_row("Duplicates skipped", str(summary.duplicates_skipped)) + table.add_row("Unsupported files ignored", str(summary.unsupported_files_ignored)) + console.print(table) + + +@app.command() +def status() -> None: + """Show ledger totals and pending files.""" + snapshot = load_status(Path(".")) + + totals = Table(title="Ledger Status") + totals.add_column("Metric") + totals.add_column("Count", justify="right") + totals.add_row("Total files tracked", str(snapshot.total)) + totals.add_row("Pending", str(snapshot.pending)) + totals.add_row("Completed", str(snapshot.completed)) + totals.add_row("Failed", str(snapshot.failed)) + console.print(totals) + + pending_table = Table(title="Pending Files") + pending_table.add_column("Filename") + pending_table.add_column("Type") + pending_table.add_column("Size", justify="right") + pending_table.add_column("Discovered At") + + if snapshot.pending_files: + for row in snapshot.pending_files: + pending_table.add_row( + str(row["original_filename"]), + str(row["file_type"]), + str(row["file_size"]), + str(row["discovered_at"]), + ) + else: + pending_table.add_row("-", "-", "-", "-") + + console.print(pending_table) diff --git a/src/scribeflow/config.py b/src/scribeflow/config.py new file mode 100644 index 0000000..0337c05 --- /dev/null +++ b/src/scribeflow/config.py @@ -0,0 +1,28 @@ +"""Configuration constants for local workspace layout.""" + +from pathlib import Path + +WORKSPACE_ROOT = Path(".") + +REQUIRED_DIRECTORIES = [ + Path("inbox/mp4"), + Path("inbox/mp3"), + Path("working/audio"), + Path("working/temp"), + Path("working/logs"), + Path("output/markdown"), + Path("output/raw_json"), + Path("output/subtitles"), + Path("archive/completed"), + Path("archive/failed"), + Path(".scribeflow"), +] + +INBOX_DIRECTORIES = [ + Path("inbox/mp4"), + Path("inbox/mp3"), +] + +LEDGER_PATH = Path(".scribeflow/ledger.sqlite") + +SUPPORTED_EXTENSIONS = {".mp3", ".mp4"} diff --git a/src/scribeflow/hashing.py b/src/scribeflow/hashing.py new file mode 100644 index 0000000..fe3fc54 --- /dev/null +++ b/src/scribeflow/hashing.py @@ -0,0 +1,18 @@ +"""File hashing utilities.""" + +from __future__ import annotations + +import hashlib +from pathlib import Path + + +DEFAULT_CHUNK_SIZE = 1024 * 1024 + + +def sha256_file(path: Path, chunk_size: int = DEFAULT_CHUNK_SIZE) -> str: + """Compute the SHA-256 hash for a file using chunked reads.""" + digest = hashlib.sha256() + with path.open("rb") as file_handle: + while chunk := file_handle.read(chunk_size): + digest.update(chunk) + return digest.hexdigest() diff --git a/src/scribeflow/ledger.py b/src/scribeflow/ledger.py new file mode 100644 index 0000000..784124a --- /dev/null +++ b/src/scribeflow/ledger.py @@ -0,0 +1,165 @@ +"""SQLite ledger access layer.""" + +from __future__ import annotations + +import sqlite3 +from dataclasses import dataclass +from pathlib import Path + + +ALLOWED_STATUSES = ("pending", "completed", "failed") + + +@dataclass(slots=True) +class LedgerEntry: + source_path: str + original_filename: str + normalized_filename: str + file_type: str + file_size: int + file_hash: str + status: str = "pending" + discovered_at: str = "" + started_at: str | None = None + completed_at: str | None = None + output_markdown_path: str | None = None + output_json_path: str | None = None + output_subtitle_path: str | None = None + error_message: str | None = None + retry_count: int = 0 + + +class Ledger: + """Provides read/write operations for the local ledger database.""" + + def __init__(self, db_path: Path) -> None: + self.db_path = db_path + + def initialize(self) -> None: + """Create the ledger table and indexes if missing.""" + self.db_path.parent.mkdir(parents=True, exist_ok=True) + with sqlite3.connect(self.db_path) as connection: + connection.execute( + """ + CREATE TABLE IF NOT EXISTS ledger ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_path TEXT NOT NULL, + original_filename TEXT NOT NULL, + normalized_filename TEXT NOT NULL, + file_type TEXT NOT NULL, + file_size INTEGER NOT NULL, + file_hash TEXT NOT NULL UNIQUE, + status TEXT NOT NULL CHECK (status IN ('pending', 'completed', 'failed')), + discovered_at TEXT NOT NULL, + started_at TEXT, + completed_at TEXT, + output_markdown_path TEXT, + output_json_path TEXT, + output_subtitle_path TEXT, + error_message TEXT, + retry_count INTEGER NOT NULL DEFAULT 0 + ) + """ + ) + connection.execute( + "CREATE INDEX IF NOT EXISTS idx_ledger_status ON ledger(status)" + ) + + def register_pending(self, entry: LedgerEntry) -> bool: + """Insert a pending entry if hash is new; return True if inserted.""" + if entry.status not in ALLOWED_STATUSES: + raise ValueError(f"Unsupported status: {entry.status}") + + with sqlite3.connect(self.db_path) as connection: + cursor = connection.execute( + """ + INSERT INTO ledger ( + source_path, + original_filename, + normalized_filename, + file_type, + file_size, + file_hash, + status, + discovered_at, + started_at, + completed_at, + output_markdown_path, + output_json_path, + output_subtitle_path, + error_message, + retry_count + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(file_hash) DO NOTHING + """, + ( + entry.source_path, + entry.original_filename, + entry.normalized_filename, + entry.file_type, + entry.file_size, + entry.file_hash, + entry.status, + entry.discovered_at, + entry.started_at, + entry.completed_at, + entry.output_markdown_path, + entry.output_json_path, + entry.output_subtitle_path, + entry.error_message, + entry.retry_count, + ), + ) + return cursor.rowcount > 0 + + def exists_by_hash(self, file_hash: str) -> bool: + """Return True if a file hash is already tracked.""" + with sqlite3.connect(self.db_path) as connection: + row = connection.execute( + "SELECT 1 FROM ledger WHERE file_hash = ? LIMIT 1", (file_hash,) + ).fetchone() + return row is not None + + def counts(self) -> dict[str, int]: + """Return total and per-status counts.""" + with sqlite3.connect(self.db_path) as connection: + row = connection.execute( + """ + SELECT + COUNT(*) AS total, + SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed, + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed + FROM ledger + """ + ).fetchone() + + return { + "total": int(row[0] or 0), + "pending": int(row[1] or 0), + "completed": int(row[2] or 0), + "failed": int(row[3] or 0), + } + + def pending_rows(self) -> list[dict[str, str | int]]: + """Return pending files for status reporting.""" + with sqlite3.connect(self.db_path) as connection: + connection.row_factory = sqlite3.Row + rows = connection.execute( + """ + SELECT original_filename, file_type, file_size, discovered_at + FROM ledger + WHERE status = 'pending' + ORDER BY discovered_at ASC + """ + ).fetchall() + + return [ + { + "original_filename": row["original_filename"], + "file_type": row["file_type"], + "file_size": int(row["file_size"]), + "discovered_at": row["discovered_at"], + } + for row in rows + ] diff --git a/src/scribeflow/scanner.py b/src/scribeflow/scanner.py new file mode 100644 index 0000000..f8237b3 --- /dev/null +++ b/src/scribeflow/scanner.py @@ -0,0 +1,63 @@ +"""Inbox scanner implementation.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path + +from scribeflow.config import INBOX_DIRECTORIES, LEDGER_PATH, SUPPORTED_EXTENSIONS +from scribeflow.hashing import sha256_file +from scribeflow.ledger import Ledger, LedgerEntry +from scribeflow.utils import normalize_filename, to_relative_posix + + +@dataclass(slots=True) +class ScanSummary: + files_scanned: int = 0 + new_files_registered: int = 0 + duplicates_skipped: int = 0 + unsupported_files_ignored: int = 0 + + +def scan_workspace(root: Path = Path(".")) -> ScanSummary: + """Scan inbox folders and register new media files.""" + ledger = Ledger(root / LEDGER_PATH) + ledger.initialize() + + summary = ScanSummary() + + for inbox in INBOX_DIRECTORIES: + inbox_path = root / inbox + if not inbox_path.exists(): + continue + + for file_path in sorted(inbox_path.iterdir()): + if not file_path.is_file(): + continue + + summary.files_scanned += 1 + + suffix = file_path.suffix.lower() + if suffix not in SUPPORTED_EXTENSIONS: + summary.unsupported_files_ignored += 1 + continue + + file_hash = sha256_file(file_path) + entry = LedgerEntry( + source_path=to_relative_posix(file_path, root), + original_filename=file_path.name, + normalized_filename=normalize_filename(file_path.name), + file_type=suffix.lstrip("."), + file_size=file_path.stat().st_size, + file_hash=file_hash, + status="pending", + discovered_at=datetime.now(UTC).isoformat(), + ) + + if ledger.register_pending(entry): + summary.new_files_registered += 1 + else: + summary.duplicates_skipped += 1 + + return summary diff --git a/src/scribeflow/status.py b/src/scribeflow/status.py new file mode 100644 index 0000000..02ef6f7 --- /dev/null +++ b/src/scribeflow/status.py @@ -0,0 +1,35 @@ +"""Status query helpers.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from scribeflow.config import LEDGER_PATH +from scribeflow.ledger import Ledger + + +@dataclass(slots=True) +class StatusSnapshot: + total: int + pending: int + completed: int + failed: int + pending_files: list[dict[str, str | int]] + + +def load_status(root: Path = Path(".")) -> StatusSnapshot: + """Load aggregate status and pending file details from ledger.""" + ledger = Ledger(root / LEDGER_PATH) + ledger.initialize() + + counts = ledger.counts() + pending_files = ledger.pending_rows() + + return StatusSnapshot( + total=counts["total"], + pending=counts["pending"], + completed=counts["completed"], + failed=counts["failed"], + pending_files=pending_files, + ) diff --git a/src/scribeflow/utils.py b/src/scribeflow/utils.py new file mode 100644 index 0000000..3434a13 --- /dev/null +++ b/src/scribeflow/utils.py @@ -0,0 +1,20 @@ +"""General utility helpers for ScribeFlow.""" + +from pathlib import Path + + +def ensure_directories(root: Path, directories: list[Path]) -> None: + """Create required directories if they do not exist.""" + for relative_dir in directories: + (root / relative_dir).mkdir(parents=True, exist_ok=True) + + +def normalize_filename(filename: str) -> str: + """Normalize a filename for stable ledger storage.""" + name = Path(filename).name.strip().lower().replace(" ", "_") + return name + + +def to_relative_posix(path: Path, root: Path) -> str: + """Return a workspace-relative POSIX path string.""" + return path.resolve().relative_to(root.resolve()).as_posix() diff --git a/tests/test_phase1.py b/tests/test_phase1.py new file mode 100644 index 0000000..0dec3e0 --- /dev/null +++ b/tests/test_phase1.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import hashlib +from pathlib import Path + +from typer.testing import CliRunner + +from scribeflow.cli import app +from scribeflow.config import LEDGER_PATH, REQUIRED_DIRECTORIES +from scribeflow.hashing import sha256_file +from scribeflow.ledger import Ledger +from scribeflow.status import load_status + + +runner = CliRunner() + + +def write_file(path: Path, data: bytes) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + + +def test_init_creates_required_folders_and_ledger(tmp_path: Path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + + result = runner.invoke(app, ["init"]) + + assert result.exit_code == 0 + for folder in REQUIRED_DIRECTORIES: + assert (tmp_path / folder).is_dir() + assert (tmp_path / LEDGER_PATH).is_file() + + +def test_hashing_is_consistent_sha256(tmp_path: Path) -> None: + file_path = tmp_path / "sample.mp3" + payload = b"consistent-bytes-for-hash" + file_path.write_bytes(payload) + + expected = hashlib.sha256(payload).hexdigest() + + assert sha256_file(file_path) == expected + assert sha256_file(file_path) == expected + + +def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + runner.invoke(app, ["init"]) + + write_file(tmp_path / "inbox/mp3/lecture-a.mp3", b"mp3-content-a") + write_file(tmp_path / "inbox/mp4/lecture-b.mp4", b"mp4-content-b") + + result = runner.invoke(app, ["scan"]) + + assert result.exit_code == 0 + assert "Files scanned" in result.output + assert "New files registered" in result.output + + ledger = Ledger(tmp_path / LEDGER_PATH) + counts = ledger.counts() + assert counts["total"] == 2 + assert counts["pending"] == 2 + + +def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + runner.invoke(app, ["init"]) + + duplicate_payload = b"same-content-different-name" + write_file(tmp_path / "inbox/mp3/first.mp3", duplicate_payload) + write_file(tmp_path / "inbox/mp4/second.mp4", duplicate_payload) + + first_scan = runner.invoke(app, ["scan"]) + second_scan = runner.invoke(app, ["scan"]) + + assert first_scan.exit_code == 0 + assert second_scan.exit_code == 0 + + ledger = Ledger(tmp_path / LEDGER_PATH) + counts = ledger.counts() + assert counts["total"] == 1 + assert counts["pending"] == 1 + + +def test_status_reads_ledger_counts(tmp_path: Path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + runner.invoke(app, ["init"]) + + write_file(tmp_path / "inbox/mp3/pending.mp3", b"pending-bytes") + runner.invoke(app, ["scan"]) + + status = load_status(tmp_path) + + assert status.total == 1 + assert status.pending == 1 + assert status.completed == 0 + assert status.failed == 0 + assert len(status.pending_files) == 1 + assert status.pending_files[0]["original_filename"] == "pending.mp3" From f39a614fcd68b0917af5f3eb7bc4277cef47861a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:07:22 +0000 Subject: [PATCH 5/6] Align docs and config with implemented ledger statuses --- README.md | 2 +- config/scribeflow.example.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5bd1135..b6d2601 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ ScribeFlow keeps a local SQLite database to persist processing state. Recommended ledger responsibilities: - track canonical file path and content hash -- track status transitions (`discovered` -> `pending` -> `processing` -> `completed`/`failed`) +- track current statuses (`pending`, `completed`, `failed`) in Phase 1 - store attempt count and timestamps - record output artifact locations - avoid duplicate processing by hash match diff --git a/config/scribeflow.example.toml b/config/scribeflow.example.toml index e2a595f..90de4a9 100644 --- a/config/scribeflow.example.toml +++ b/config/scribeflow.example.toml @@ -19,4 +19,4 @@ auto_archive_completed = false max_retries = 3 [ledger] -path = ".scribeflow/ledger.db" +path = ".scribeflow/ledger.sqlite" From 18f322ea6db8c5b54a9c50ba507d41b3fce6f375 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:08:02 +0000 Subject: [PATCH 6/6] Add pytest MonkeyPatch annotations in phase 1 tests --- tests/test_phase1.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/test_phase1.py b/tests/test_phase1.py index 0dec3e0..d10c894 100644 --- a/tests/test_phase1.py +++ b/tests/test_phase1.py @@ -3,6 +3,7 @@ import hashlib from pathlib import Path +import pytest from typer.testing import CliRunner from scribeflow.cli import app @@ -20,7 +21,9 @@ def write_file(path: Path, data: bytes) -> None: path.write_bytes(data) -def test_init_creates_required_folders_and_ledger(tmp_path: Path, monkeypatch) -> None: +def test_init_creates_required_folders_and_ledger( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.chdir(tmp_path) result = runner.invoke(app, ["init"]) @@ -42,7 +45,9 @@ def test_hashing_is_consistent_sha256(tmp_path: Path) -> None: assert sha256_file(file_path) == expected -def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None: +def test_scan_registers_new_mp3_and_mp4( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.chdir(tmp_path) runner.invoke(app, ["init"]) @@ -61,7 +66,9 @@ def test_scan_registers_new_mp3_and_mp4(tmp_path: Path, monkeypatch) -> None: assert counts["pending"] == 2 -def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None: +def test_scan_skips_duplicates_by_hash( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.chdir(tmp_path) runner.invoke(app, ["init"]) @@ -81,7 +88,9 @@ def test_scan_skips_duplicates_by_hash(tmp_path: Path, monkeypatch) -> None: assert counts["pending"] == 1 -def test_status_reads_ledger_counts(tmp_path: Path, monkeypatch) -> None: +def test_status_reads_ledger_counts( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.chdir(tmp_path) runner.invoke(app, ["init"])