script3r · script3r · Sep 16, 2025 · Sep 15, 2025 · Sep 15, 2025 · Sep 15, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,17 +1,7 @@
 [workspace]
 members = [
     "crates/scanner-core",
-    "crates/detector-go",
-    "crates/detector-java",
-    "crates/detector-c",
-    "crates/detector-cpp",
-    "crates/detector-rust",
-    "crates/detector-python",
-    "crates/detector-php",
-    "crates/detector-swift",
-    "crates/detector-objc",
-    "crates/detector-kotlin",
-    "crates/detector-erlang",
+    "crates/cbom-generator",
     "crates/cli",
 ]
 resolver = "2"
@@ -26,20 +16,20 @@ repository = "https://example.com/cipherscope/repo"
 
 [workspace.dependencies]
 anyhow = "1"
-thiserror = "1"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 toml = "0.8"
 regex = "1"
 aho-corasick = "1"
-once_cell = "1"
 rayon = "1"
 ignore = "0.4"
-memmap2 = "0.9"
 clap = { version = "4", features = ["derive"] }
-humantime = "2"
 globset = "0.4"
 crossbeam-channel = "0.5"
 walkdir = "2"
 num_cpus = "1"
+uuid = { version = "1", features = ["v4", "v5", "serde"] }
+x509-parser = "0.15"
+chrono = { version = "0.4", features = ["serde"] }
+tempfile = "3"
 
diff --git a/README.md b/README.md
@@ -1,166 +1,100 @@
-## CipherScope
+# CipherScope
 
 <div align="center">
   <img src="cipherscope.png" alt="CipherScope Logo" width="350" height="350">
 </div>
 
-Fast, low-false-positive static scanner that finds third-party cryptographic libraries and call sites across 11 programming languages: Go, Java, C, C++, Rust, Python, PHP, Swift, Objective-C, Kotlin, and Erlang.
+Fast cryptographic inventory generator that creates Minimal Viable Cryptographic Bill of Materials (MV-CBOM) documents. It scans codebases to identify cryptographic algorithms and certificates and to assess post-quantum cryptography readiness.
 
-### Install & Run
+## Quick Start
 
 ```bash
 cargo build --release
-./target/release/cipherscope .
+./target/release/cipherscope --patterns patterns.toml --progress /path/to/scan [... paths]
 ```
 
-JSONL and SARIF:
+## What It Does
 
-```bash
-./target/release/cipherscope . --json > findings.jsonl
-./target/release/cipherscope . --sarif findings.sarif
-```
-
-Key flags:
-- `--threads N`: set thread pool size
-- `--max-file-size MB`: skip large files (default 2)
-- `--patterns PATH`: specify patterns file (default: `patterns.toml`)
-- `--progress`: show progress bar during scanning
-- `--include-glob GLOB` / `--exclude-glob GLOB`
-- `--deterministic`: stable output ordering
-- `--print-config`: print loaded `patterns.toml`
-- `--dry-run`: list files to be scanned
-
-### Output
-
-Pretty table to stdout (default) and optional JSONL/SARIF.
-
-Example table:
-
-```text
-Language | Library | Count | Example
----------|---------|-------|--------
-Rust | RustCrypto | 2 | src/main.rs:12 aes_gcm::Aes256Gcm
-```
+- **Detects** cryptographic usage across 11 languages
+- **Identifies** many cryptographic algorithms (AES, SHA, RSA, ECDSA, ChaCha20, etc.)
+- **Outputs** JSON inventory with NIST quantum security levels
+- **Runs fast** - GiB/s throughput with parallel scanning
 
-JSONL example:
+## Example Output
 
 ```json
-{"language":"Rust","library":"RustCrypto","file":"src/main.rs","span":{"line":12,"column":5},"symbol":"aes_gcm::Aes256Gcm","snippet":"use aes_gcm::Aes256Gcm;","detector_id":"detector-rust"}
+{
+  "bomFormat": "MV-CBOM",
+  "specVersion": "1.0",
+  "cryptoAssets": [{
+    "name": "RSA",
+    "assetProperties": {
+      "primitive": "signature",
+      "parameterSet": {"keySize": 2048},
+      "nistQuantumSecurityLevel": 0
+    }
+  }]
+}
 ```
 
-SARIF snippet:
-
-```json
-{"version":"2.1.0","runs":[{"tool":{"driver":{"name":"cipherscope"}},"results":[{"ruleId":"detector-rust","message":{"text":"RustCrypto in Rust"}}]}]}
-```
-
-### Configuration & Patterns
-
-Patterns are loaded from `patterns.toml` (and optional `patterns.local.toml`, if you add it). The schema supports per-language `include`/`import`/`namespace`/`apis` anchored regexes. The engine strips comments and avoids string literals to reduce false positives.
-
-#### Supported Languages & File Extensions
-
-The scanner automatically detects and processes files with these extensions:
+## Options
 
-- **C/C++**: `.c`, `.h`, `.cc`, `.cpp`, `.cxx`, `.c++`, `.hpp`, `.hxx`, `.h++`, `.hh`
-- **Java**: `.java`
-- **Go**: `.go`
-- **Rust**: `.rs`
-- **Python**: `.py`, `.pyw`, `.pyi`
-- **PHP**: `.php`, `.phtml`, `.php3`, `.php4`, `.php5`, `.phps`
-- **Swift**: `.swift`
-- **Objective-C**: `.m`, `.mm`, `.M`
-- **Kotlin**: `.kt`, `.kts`
-- **Erlang**: `.erl`, `.hrl`, `.beam`
+### Core Options
+- `--patterns PATH` - Custom patterns file (default: `patterns.toml`)
+- `--progress` - Show progress bar during scanning
+- `--deterministic` - Reproducible output for testing/ground-truth generation
+- `--output FILE` - Output file for single-project CBOM (default: stdout)
+- `--recursive` - Generate MV-CBOMs for all discovered projects
+- `--output-dir DIR` - Output directory for recursive CBOMs
 
-#### High-Performance Architecture
+### Filtering & Performance
+- `--threads N` - Number of processing threads
+- `--max-file-size MB` - Maximum file size to scan, in MB (default: 2)
+- `--include-glob GLOB` - Include files matching glob pattern(s)
+- `--exclude-glob GLOB` - Exclude files matching glob pattern(s)
 
-CipherScope uses a **producer-consumer model** inspired by ripgrep to achieve maximum throughput on large codebases:
+### Certificate Scanning
+- `--skip-certificates` - Skip certificate scanning during CBOM generation
 
-**Producer (Parallel Directory Walker)**:
-- Uses `ignore::WalkParallel` for parallel filesystem traversal
-- Automatically respects `.gitignore` files and skips hidden directories
-- Critical optimization: avoids descending into `node_modules`, `.git`, and other irrelevant directories
-- Language detection happens early to filter files before expensive operations
+### Configuration
+- `--print-config` - Print merged patterns/config and exit
 
-**Consumers (Parallel File Processors)**:
-- Uses `rayon` thread pools for parallel file processing
-- Batched processing (1000 files per batch) for better cache locality
-- Comment stripping and preprocessing shared across all detectors
-- Lockless atomic counters for progress tracking
+## Languages Supported
 
-**Key Optimizations**:
-- **Ultra-fast language detection**: Direct byte comparison, no string allocations
-- **Syscall reduction**: 90% fewer `metadata()` calls through early filtering  
-- **Aho-Corasick prefiltering**: Skip expensive regex matching when no keywords found
-- **Batched channel communication**: Reduces overhead between producer/consumer threads
-- **Optimal thread configuration**: Automatically uses `num_cpus` for directory traversal
+C, C++, Go, Java, Kotlin, Python, Rust, Swift, Objective-C, PHP, Erlang
 
-#### Performance Benchmarks
+## Configuration
 
-**File Discovery Performance**:
-- **5M file directory**: ~20-30 seconds (previously 90+ seconds)
-- **Throughput**: 150,000-250,000 files/second discovery rate
-- **Processing**: 4+ GiB/s content scanning throughput
+Edit `patterns.toml` to add new libraries or algorithms. No code changes needed.
 
-**Scalability**:
-- Linear scaling with CPU cores for file processing
-- Efficient memory usage through batched processing
-- Progress reporting accuracy: 100% (matches `find` command results)
+## How It Works (High-Level)
 
-### Detector Architecture
+1. Workspace discovery and prefilter
+   - Walks files respecting .gitignore
+   - Cheap Aho-Corasick prefilter using language-specific substrings derived from patterns
+2. Language detection and comment stripping
+   - Detects language by extension; strips comments once for fast regex matching
+3. Library identification (anchors)
+   - Per-language detector loads compiled patterns for that language (from `patterns.toml`)
+   - Looks for include/import/namespace/API anchors to confirm a library is present in a file
+4. Algorithm matching
+   - For each identified library, matches algorithm `symbol_patterns` (regex) against the file
+   - Extracts parameters via `parameter_patterns` (e.g., key size, curve) with defaults when absent
+   - Emits findings with file, line/column, library, algorithm, primitive, and NIST quantum level
+5. Deep static analysis (fallback/enrichment)
+   - For small scans, analyzes files directly with the registry to find additional algorithms even if no library finding was produced
+6. CBOM generation
+   - Findings are deduplicated and merged
+   - Final MV-CBOM JSON is printed or written per CLI options
 
-The scanner uses a modular detector architecture with dedicated crates for each language:
+All behavior is driven by `patterns.toml` — adding new libraries/algorithms is a data-only change.
 
-- **detector-c**: C language support
-- **detector-cpp**: C++ language support  
-- **detector-go**: Go language support
-- **detector-java**: Java language support
-- **detector-rust**: Rust language support
-- **detector-python**: Python language support
-- **detector-php**: PHP language support
-- **detector-swift**: Swift language support
-- **detector-objc**: Objective-C language support
-- **detector-kotlin**: Kotlin language support
-- **detector-erlang**: Erlang language support
-
-Each detector implements the `Detector` trait and can be extended independently. To add support for a new language, create a new detector crate under `crates/` or extend the `patterns.toml` to cover additional libraries. See `crates/scanner-core/src/lib.rs` for the trait definition and pattern-driven detector implementation.
-
-### Tests & Benchmarks
-
-Run unit tests and integration tests (fixtures):
+## Testing
 
 ```bash
 cargo test
 ```
 
-Benchmark scan throughput on test fixtures:
-
-```bash
-cargo bench
-```
-
-**Expected benchmark results** (on modern hardware):
-- **Throughput**: ~4.2 GiB/s content processing
-- **File discovery**: 150K-250K files/second  
-- **Memory efficient**: Batched processing prevents memory spikes
-
-**Real-world performance** (5M file Java codebase):
-- **Discovery phase**: 20-30 seconds (down from 90+ seconds)
-- **Processing phase**: Depends on file content and pattern complexity
-- **Progress accuracy**: Exact match with `find` command results
-
-To test progress reporting accuracy on your codebase:
-
-```bash
-# Count files that match your glob patterns
-find /path/to/code -name "*.java" | wc -l
-
-# Run cipherscope with same pattern - numbers should match
-./target/release/cipherscope /path/to/code --include-glob "*.java" --progress
-```
-
-### Contributing
-
-See `CONTRIBUTING.md` for guidelines on adding languages, libraries, and improving performance.
+## License
 
+MIT
diff --git a/crates/cbom-generator/Cargo.toml b/crates/cbom-generator/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "cbom-generator"
+version = "0.1.0"
+edition = "2021"
+license = "Apache-2.0"  # NOTE(review): README License section says MIT — confirm
+
+[lib]
+name = "cbom_generator"
+path = "src/lib.rs"
+
+[dependencies]
+anyhow = { workspace = true }
+chrono = { workspace = true }
+regex = { workspace = true }
+scanner-core = { path = "../scanner-core" }
+serde = { workspace = true }
+serde_json = { workspace = true }
+toml = { workspace = true }
+uuid = { workspace = true }
+walkdir = { workspace = true }
+x509-parser = { workspace = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }