feat: transparently append to compressed archives

When appending to a compressed archive (gzip, brotli, zstd), the tool
now handles compression automatically. Since these formats don't
support efficient in-place appends, we write a new compressed file
containing all of the data and atomically rename it over the original
(this requires enough free space on the filesystem for both copies).

This means you can work with compressed archives the same way as
uncompressed ones. Point the tool at your .json.gz file and append
values. No manual decompression/recompression needed.
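
As a rough sketch, the rewrite path is equivalent to the following shell
steps (filenames and the temp-file name are illustrative, not the tool's
exact behavior):

```bash
# Appending to data.json.archive.gz by rewrite-and-rename:
gunzip -c data.json.archive.gz > /tmp/archive.jsonl      # decompress the existing events
cat new-observation-events.jsonl >> /tmp/archive.jsonl   # append the new delta events
gzip -c /tmp/archive.jsonl > data.json.archive.gz.tmp    # recompress into a temp file
mv data.json.archive.gz.tmp data.json.archive.gz         # atomically replace the original
```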
commit 2ab1c31993
Author: nobody
Date: 2025-11-30 17:09:44 -08:00
Signed by: GrocerPublishAgent (GPG key ID: 43B1C298CDDE181C)
34 changed files with 4747 additions and 1099 deletions

.gitignore (vendored): 4 lines changed

@ -1,2 +1,6 @@
/target
.redo/
docs/demo/v1.json.archive
tests/compression-integration/data/
tests/compression-integration/out/


@ -1,6 +1,37 @@
## Running Tests
### Unit tests
```bash
cargo test # Run without compression support
cargo test --features compression # Run with compression support
```
### Integration tests
The compression integration tests verify appending to compressed archives (gzip, brotli, zstd):
```bash
cd tests/compression-integration
./run_all.sh # Run all compression tests
./run_gzip_test.sh # Run only gzip test
./run_brotli_test.sh # Run only brotli test
./run_zstd_test.sh # Run only zstd test
```
These tests:
1. Create an uncompressed archive from the first state file
2. Compress it with the respective tool (gzip/brotli/zstd)
3. Append additional state files to the compressed archive
4. Verify the archive can be read and shows the correct observation count
Requirements: Python 3 (for test data generation), gzip, brotli, zstd command-line tools.
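For reference, the gzip test reduces to roughly this sequence (paths are
hypothetical and the exact verification command differs; the authoritative
steps live in `run_gzip_test.sh`):
```bash
json-archive state-1.json                        # 1. create state-1.json.archive (uncompressed)
gzip state-1.json.archive                        # 2. compress it with the external gzip tool
json-archive state-1.json.archive.gz state-2.json state-3.json  # 3. append to the compressed archive
zcat state-1.json.archive.gz | wc -l             # 4. check it is still readable line-oriented JSONL
```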
### Fuzz testing
To begin fuzzing, run: `cargo fuzz run <fuzz target name>`
The source code for a fuzz target by default lives in `fuzz/fuzz_targets/<fuzz target name>.rs`.
Each fuzz target is a Rust program that is given random data and tests a crate (in this case, json-archive). Use `cargo fuzz list` to view all existing fuzz targets.

Cargo.lock (generated): 140 lines changed

@ -49,9 +49,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bitflags"
version = "2.9.4"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
[[package]]
name = "brotli"
@ -82,9 +82,9 @@ checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
[[package]]
name = "cc"
version = "1.2.39"
version = "1.2.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1354349954c6fc9cb0deab020f27f783cf0b604e8bb754dc4658ecf0d29c35f"
checksum = "c481bdbf0ed3b892f6f806287d72acd515b352a4ec27a208489b8c1bc839633a"
dependencies = [
"find-msvc-tools",
"jobserver",
@ -94,9 +94,9 @@ dependencies = [
[[package]]
name = "cfg-if"
version = "1.0.3"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "chrono"
@ -156,15 +156,15 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "find-msvc-tools"
version = "0.1.2"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959"
checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
[[package]]
name = "flate2"
version = "1.1.2"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d"
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
dependencies = [
"crc32fast",
"miniz_oxide",
@ -172,14 +172,14 @@ dependencies = [
[[package]]
name = "getrandom"
version = "0.3.3"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
"wasip2",
]
[[package]]
@ -224,9 +224,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.81"
version = "0.3.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8"
dependencies = [
"once_cell",
"wasm-bindgen",
@ -250,9 +250,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.176"
version = "0.2.177"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174"
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
[[package]]
name = "linux-raw-sys"
@ -279,6 +279,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
"adler2",
"simd-adler32",
]
[[package]]
@ -304,18 +305,18 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "proc-macro2"
version = "1.0.101"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
version = "1.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
dependencies = [
"proc-macro2",
]
@ -353,9 +354,9 @@ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
version = "1.0.227"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80ece43fc6fbed4eb5392ab50c07334d3e577cbf40997ee896fe7af40bba4245"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
@ -363,18 +364,18 @@ dependencies = [
[[package]]
name = "serde_core"
version = "1.0.227"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a576275b607a2c86ea29e410193df32bc680303c82f31e275bbfcafe8b33be5"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.227"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51e694923b8824cf0e9b382adf0f60d4e05f348f357b38833a3fa5ed7c2ede04"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
@ -401,10 +402,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "syn"
version = "2.0.106"
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]]
name = "syn"
version = "2.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87"
dependencies = [
"proc-macro2",
"quote",
@ -426,9 +433,9 @@ dependencies = [
[[package]]
name = "unicode-ident"
version = "1.0.19"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "uuid"
@ -442,15 +449,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "wasi"
version = "0.14.7+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
dependencies = [
"wasip2",
]
[[package]]
name = "wasip2"
version = "1.0.1+wasi-0.2.4"
@ -462,9 +460,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
version = "0.2.104"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d"
checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd"
dependencies = [
"cfg-if",
"once_cell",
@ -473,25 +471,11 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.104"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119"
checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -499,31 +483,31 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.104"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40"
dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.104"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1"
checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4"
dependencies = [
"unicode-ident",
]
[[package]]
name = "windows-core"
version = "0.62.1"
version = "0.62.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6844ee5416b285084d3d3fffd743b925a6c9385455f64f6d4fa3031c4c2749a9"
checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
dependencies = [
"windows-implement",
"windows-interface",
@ -534,9 +518,9 @@ dependencies = [
[[package]]
name = "windows-implement"
version = "0.60.1"
version = "0.60.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edb307e42a74fb6de9bf3a02d9712678b22399c87e6fa869d6dfcd8c1b7754e0"
checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
dependencies = [
"proc-macro2",
"quote",
@ -545,9 +529,9 @@ dependencies = [
[[package]]
name = "windows-interface"
version = "0.59.2"
version = "0.59.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0abd1ddbc6964ac14db11c7213d6532ef34bd9aa042c2e5935f59d7908b46a5"
checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
dependencies = [
"proc-macro2",
"quote",
@ -556,33 +540,33 @@ dependencies = [
[[package]]
name = "windows-link"
version = "0.2.0"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-result"
version = "0.4.0"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f"
checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-strings"
version = "0.5.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda"
checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-sys"
version = "0.61.1"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]

README.md

@ -2,11 +2,11 @@
A practical CLI tool for tracking JSON file changes over time. Instead of keeping multiple copies of JSON files, this creates compact delta-based archives that preserve the complete history.
## What it does
## Why I made this tool
This tool solves a simple problem: you have a JSON file that changes regularly, and you want to track its history without storing dozens of full copies.
The problem I am solving: I have a JSON file that changes regularly (output of a scraping pipeline), and I want to track its history without storing dozens of full copies.
`json-archive` creates a `.json.archive` file next to your original JSON file. Each time you run the tool, it calculates only what changed and appends those deltas to the archive. You get complete history with minimal storage overhead.
`json-archive` creates a `.json.archive` file next to your original JSON file. Each time you run the tool, it calculates only what changed and appends those deltas to the archive. You get complete history with minimal storage overhead. It can move a .json file into the archive or leave it untouched.
The archive format is human-readable JSONL (not binary), making it easy to inspect, debug, and pipe into other scripts or web visualizations.
@ -18,9 +18,6 @@ json-archive data.json
# Later, append changes to existing archive
json-archive data.json.archive data.json
# Or let it infer again (won't overwrite without --force)
json-archive data.json # Safe: won't overwrite existing data.json.archive
```
## Real-world use case
@ -63,7 +60,7 @@ While the core design keeps things simple and readable, the tool does work with
This works fine for the happy path with archive files up to a few hundred megabytes, but contradicts the "keep it simple" design philosophy - it's included because it's practically useful.
**Building without compression**: Compression libraries are a security vulnerability vector. The default build includes them because most users want convenience. If you don't want to bundle compression libraries:
**Building without compression**: Compression libraries are a security vulnerability vector. The default build includes them because I want convenience. If you don't want to bundle compression libraries:
```bash
cargo install json-archive --no-default-features
@ -157,38 +154,11 @@ cargo build --release
Archives use the `.json.archive` extension by default:
- `data.json` -> `data.json.archive`
- `video.info.json` -> `video.info.json.archive`
- `config.json` -> `config.json.archive`
This makes it immediately clear which files are archives and which are source files.
## Error handling
The tool uses descriptive diagnostics instead of cryptic error codes:
```
error: I couldn't find the input file: missing.json
|
= help: Make sure the file path is correct and the file exists.
Check for typos in the filename.
```
Diagnostics are categorized as Fatal, Warning, or Info, and the tool exits with non-zero status only for fatal errors.
## Performance characteristics
- **Memory usage**: Bounded by largest single JSON file, not archive size
- **Append speed**: Fast - only computes deltas, doesn't re-read entire archive
- **Read speed**: Linear scan, but snapshots allow seeking to recent state
- **File size**: Typically 10-30% the size of storing all JSON copies
For very large archives, consider using snapshots (`-s` flag) to enable faster seeking.
- `<filename>.json` -> `<filename>.json.archive`
## Browser compatibility
Archives can be loaded directly in web applications:
The strength of the file format is easy browser visualization:
```javascript
// Parse archive in browser
@ -205,7 +175,8 @@ fetch('data.json.archive')
});
```
The format uses only standard JSON. No special parsing required.
The format uses only standard JSON and organizes the data into roughly the shape
you would need anyway.
## Contributing
@ -227,10 +198,10 @@ This project is licensed under the GNU Affero General Public License v3.0 (AGPL-
- You can use, modify, and distribute this software
- If you modify and distribute it, you must share your changes under the same license
- If you run a modified version on a server or embed it in a larger system, you must make the entire system's source code available to users
- No TiVoization - hardware restrictions that prevent users from running modified versions are prohibited
- No TiVoization! Hardware restrictions that prevent users from running
modified versions are prohibited. If you hard-code a signing key into
firmware and refuse to run any user-modified programs that aren't signed
with your secret key, then you are not allowed to use this software.
The AGPL ensures that improvements to this tool remain open and available to everyone, even when used in hosted services or embedded systems.
---
*Built with Rust for reliability and performance. Designed to be simple enough to understand, powerful enough to be useful.*
The AGPL ensures that improvements to this tool remain open and available to
everyone, even when used in hosted services or embedded systems.


@ -5,5 +5,13 @@ case $1 in
addlicense -c "Peoples Grocers LLC" -f LICENSE-header -l "agpl-3.0" -s src/ >&2
;;
docs/diagnostics/json-pointer.md)
redo-ifchange src/bin/pointer_errors_demo.rs src/pointer.rs src/pointer_errors.rs
cargo run --quiet --bin pointer_errors_demo
;;
gen)
redo docs/diagnostics/json-pointer.md
;;
esac

docs/diagnostics/json-pointer.md (new file): 141 lines

@ -0,0 +1,141 @@
<!-- Generated by: cargo run --bin pointer_errors_demo > docs/diagnostics/json-pointer.md -->
# JSON Pointer Diagnostics
These are the error messages you'll see when a [JSON Pointer (RFC 6901)](https://datatracker.ietf.org/doc/html/rfc6901)
operation fails.
## Why These Errors Are Limited
The JSON object that failed to index probably doesn't exist anywhere as a file. It's
built by replaying delta events from the archive. The filename and line numbers in
these errors point to the source of the JSON pointer paths—the add/change/remove
events in the archive—not to the object itself.
A proper solution would dump the reconstructed JSON object to a file so you could
inspect it with `jq` or a text editor. That engineering work didn't happen.
Instead, you get:
- The pointer path that failed, with the failing segment underlined
- The actual value at the parent path (truncated)
- Some strings you can grep for in the archive
This is better than nothing, but it's still awkward. You can see *what* failed but
not easily inspect the full object we tried to index into. If you're lucky, the
truncated value shown is enough. If you're developing on this project, at least
you know what the errors look like.
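For example, if an error names the path `/user/emial`, a plain grep over the
archive (filename hypothetical) is usually enough to find the add/change/remove
events that last touched that subtree:
```bash
grep -n '/user' data.json.archive   # list the event lines mentioning the /user subtree
```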
## Contributing
If an error message is confusing or unhelpful for your case, please open an issue
or submit a pull request.
## Key Not Found
Key doesn't exist in the object. Shows available keys and suggests typos.
```
error E051: Path not found
I was traversing the JSON path '/user/emial' and got stuck.
I couldn't find the key 'emial'.
/user/emial
^^^^^
Value at '/user':
│ "age": ...
│ "email": ...
│ "name": ...
Available keys: age, email, name
Did you mean 'email'?
```
## Type Mismatch
Tried to index into a value that doesn't support it (e.g., `/domain` on a string,
`/0` on a number). Shows the actual type.
```
error E060: Type mismatch
I was traversing the JSON path '/users/0/email/domain' and got stuck.
I can't index into string with 'domain'.
/users/0/email/domain
^^^^^^
Value at '/users/0/email':
│ "alice@example.com"
Object keys like '/domain' only work on objects, not string.
```
## Array Index Out of Bounds
Index past the end of the array. Shows the array length.
```
error E051: Path not found
I was traversing the JSON path '/items/5' and got stuck.
I couldn't find index 5 (array length is 3).
/items/5
^
Value at '/items':
│ 0: "apple"
│ 1: "banana"
│ 2: "cherry"
Valid indices are 0-2.
```
## Array Index
If you think you have an object but you're actually indexing into an array, you'll see this error.
```
error E052: Invalid array index
I was traversing the JSON path '/items/foo' and got stuck.
I couldn't parse 'foo' as an array index.
/items/foo
^^^
Value at '/items':
│ 0: "apple"
│ 1: "banana"
│ 2: "cherry"
Array indices must be non-negative integers. Got 'foo'.
```
## Deep Path Failures
For long paths, the underline shows which segment failed. The full path remains
visible so you can see what you were trying to reach.
```
error E051: Path not found
I was traversing the JSON path '/data/users/0/profile/settings/theme' and got stuck.
I couldn't find the key 'settings'.
/data/users/0/profile/settings/theme
^^^^^^^^
Value at '/data/users/0/profile':
│ "name": ...
Available keys: name
```

docs/fuzz-testing.md (new file): 49 lines

@ -0,0 +1,49 @@
# Fuzz Testing
Fuzz testing throws random inputs at your code until something breaks.
## Commands
List available fuzz targets:
```
cargo fuzz list
```
Run a fuzz target:
```
cargo fuzz run fuzz_apply_move
```
Runs until you kill it or it finds a crash.
## Reading the Output
```
#787958 REDUCE cov: 1281 ft: 6423 corp: 1112/621Kb lim: 4096 exec/s: 13823 rss: 584Mb L: 19/3954 MS: 1 EraseBytes-
#788755 REDUCE cov: 1281 ft: 6424 corp: 1113/621Kb lim: 4096 exec/s: 13837 rss: 584Mb L: 767/3954 MS: 2 CMP-CrossOver- DE: "6\000\000\000"-
#789383 REDUCE cov: 1281 ft: 6424 corp: 1113/621Kb lim: 4096 exec/s: 13848 rss: 584Mb L: 59/3954 MS: 3 InsertByte-ShuffleBytes-EraseBytes-
```
The fields:
- `#787958` — test case number. How many inputs have been tried.
- `REDUCE` — what happened. `NEW` means new code was reached. `REDUCE` means an input was shrunk while keeping the same coverage. `pulse` is just a heartbeat.
- `cov: 1281` — coverage. Number of code edges hit. This is what you care about.
- `ft: 6423` — features. Finer-grained coverage metric. Ignore it.
- `corp: 1112/621Kb` — corpus. 1112 interesting inputs saved, 621KB total.
- `exec/s: 13823` — speed. Test cases per second.
- `rss: 584Mb` — memory use.
- `L: 19/3954` — input length. This one was 19 bytes. Largest in corpus is 3954.
- `MS: 1 EraseBytes-` — mutation. How the input was generated. Doesn't matter.
## Is It Working?
Watch `cov`. If it goes up, the fuzzer is finding new code paths. If it stops going up, either you have good coverage or the fuzzer is stuck.
`exec/s` in the thousands is fine. If it drops to double digits, something is wrong.
Seeing `NEW` events means progress. Long stretches without `NEW` means diminishing returns.
## When to Stop
When `cov` stops increasing and you're bored. Hours for a quick check, days for thoroughness.

fuzz/Cargo.lock (generated): 116 lines changed

@ -2,6 +2,27 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "adler2"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "alloc-no-stdlib"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
[[package]]
name = "alloc-stdlib"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -32,6 +53,27 @@ version = "2.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
[[package]]
name = "brotli"
version = "8.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor",
]
[[package]]
name = "brotli-decompressor"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
]
[[package]]
name = "bumpalo"
version = "3.19.0"
@ -76,6 +118,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crc32fast"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
dependencies = [
"cfg-if",
]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
@ -109,6 +160,16 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959"
[[package]]
name = "flate2"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "getrandom"
version = "0.3.3"
@ -173,13 +234,16 @@ dependencies = [
[[package]]
name = "json-archive"
version = "0.1.0"
version = "0.99.0"
dependencies = [
"brotli",
"chrono",
"flate2",
"serde",
"serde_json",
"uuid",
"xflags",
"zstd",
]
[[package]]
@ -227,6 +291,16 @@ version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "miniz_oxide"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
"adler2",
"simd-adler32",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@ -242,6 +316,12 @@ version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "pkg-config"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "proc-macro2"
version = "1.0.101"
@ -340,6 +420,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]]
name = "syn"
version = "2.0.106"
@ -547,3 +633,31 @@ name = "xflags-macros"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "672423d4fea7ffa2f6c25ba60031ea13dc6258070556f125cc4d790007d4a155"
[[package]]
name = "zstd"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.16+zstd.1.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
dependencies = [
"cc",
"pkg-config",
]

fuzz/Cargo.toml

@ -36,3 +36,10 @@ path = "fuzz_targets/fuzz_mutations.rs"
test = false
doc = false
bench = false
[[bin]]
name = "fuzz_apply_move"
path = "fuzz_targets/fuzz_apply_move.rs"
test = false
doc = false
bench = false

fuzz/fuzz_targets/fuzz_apply_move.rs (new file): 186 lines

@ -0,0 +1,186 @@
#![no_main]
use arbitrary::{Arbitrary, Unstructured};
use json_archive::apply_move;
use libfuzzer_sys::fuzz_target;
use serde_json::{json, Value};
#[derive(Arbitrary, Debug)]
struct FuzzMoveInput {
structure: FuzzStructure,
moves: Vec<(u8, u8)>,
}
#[derive(Arbitrary, Debug)]
enum FuzzStructure {
// Direct array at root path
RootArray(Vec<FuzzValue>),
// Object with array field
ObjectWithArray {
field_name: String,
array: Vec<FuzzValue>,
},
// Nested object with array
NestedArray {
outer_field: String,
inner_field: String,
array: Vec<FuzzValue>,
},
// Non-array value (should error)
NonArray(FuzzValue),
}
#[derive(Arbitrary, Debug, Clone)]
enum FuzzValue {
Null,
Bool(bool),
SmallInt(i8),
String(String),
// Limit recursion depth
Array(Vec<SimpleValue>),
Object(Vec<(String, SimpleValue)>),
}
#[derive(Arbitrary, Debug, Clone)]
enum SimpleValue {
Null,
Bool(bool),
SmallInt(i8),
String(String),
}
impl SimpleValue {
fn to_json(&self) -> Value {
match self {
SimpleValue::Null => Value::Null,
SimpleValue::Bool(b) => Value::Bool(*b),
SimpleValue::SmallInt(n) => json!(n),
SimpleValue::String(s) => Value::String(s.clone()),
}
}
}
impl FuzzValue {
fn to_json(&self) -> Value {
match self {
FuzzValue::Null => Value::Null,
FuzzValue::Bool(b) => Value::Bool(*b),
FuzzValue::SmallInt(n) => json!(n),
FuzzValue::String(s) => Value::String(s.clone()),
FuzzValue::Array(arr) => Value::Array(arr.iter().map(|v| v.to_json()).collect()),
FuzzValue::Object(obj) => {
let map: serde_json::Map<String, Value> =
obj.iter().map(|(k, v)| (k.clone(), v.to_json())).collect();
Value::Object(map)
}
}
}
}
impl FuzzStructure {
fn to_json_and_path(&self) -> (Value, String) {
match self {
FuzzStructure::RootArray(arr) => {
let json_arr = Value::Array(arr.iter().map(|v| v.to_json()).collect());
(json!({"root": json_arr}), "/root".to_string())
}
FuzzStructure::ObjectWithArray { field_name, array } => {
let json_arr = Value::Array(array.iter().map(|v| v.to_json()).collect());
let path = format!("/{}", escape_json_pointer(field_name));
(json!({ field_name.clone(): json_arr }), path)
}
FuzzStructure::NestedArray {
outer_field,
inner_field,
array,
} => {
let json_arr = Value::Array(array.iter().map(|v| v.to_json()).collect());
let path = format!(
"/{}/{}",
escape_json_pointer(outer_field),
escape_json_pointer(inner_field)
);
(
json!({ outer_field.clone(): { inner_field.clone(): json_arr } }),
path,
)
}
FuzzStructure::NonArray(val) => {
(json!({"value": val.to_json()}), "/value".to_string())
}
}
}
}
fn escape_json_pointer(s: &str) -> String {
s.replace('~', "~0").replace('/', "~1")
}
fuzz_target!(|data: &[u8]| {
let mut u = Unstructured::new(data);
if let Ok(input) = FuzzMoveInput::arbitrary(&mut u) {
let (mut state, path) = input.structure.to_json_and_path();
let original_state = state.clone();
// Get actual array from original state to compare against
let original_array = get_array_at_path(&original_state, &path).cloned();
// Convert moves to usize
let moves: Vec<(usize, usize)> = input
.moves
.iter()
.map(|(from, to)| (*from as usize, *to as usize))
.collect();
let result = apply_move(&mut state, &path, moves.clone());
match result {
Ok(()) => {
// If successful, verify invariants using actual arrays from JSON
let new_array = get_array_at_path(&state, &path);
if let (Some(orig_arr), Some(new_arr)) = (&original_array, new_array) {
// 1. Array length should be preserved
assert_eq!(
new_arr.len(),
orig_arr.len(),
"Array length changed after move: was {}, now {}",
orig_arr.len(),
new_arr.len()
);
// 2. All original elements should still exist (as a multiset)
let mut orig_sorted: Vec<_> =
orig_arr.iter().map(|v| v.to_string()).collect();
let mut new_sorted: Vec<_> = new_arr.iter().map(|v| v.to_string()).collect();
orig_sorted.sort();
new_sorted.sort();
assert_eq!(
orig_sorted, new_sorted,
"Elements were lost or duplicated during move"
);
}
}
Err(diag) => {
// Error is expected for:
// - Non-array targets
// - Out of bounds indices
// - Invalid paths
// Just make sure we got a proper diagnostic
assert!(!diag.description.is_empty());
}
}
}
});
fn get_array_at_path<'a>(state: &'a Value, path: &str) -> Option<&'a Vec<Value>> {
let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
let mut current = state;
for part in parts {
let unescaped = part.replace("~1", "/").replace("~0", "~");
current = current.get(&unescaped)?;
}
current.as_array()
}


@ -1,640 +0,0 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
use chrono::Utc;
use serde_json::Value;
use std::fs::{File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use uuid::Uuid;
use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticLevel};
use crate::diff;
use crate::events::{Event, Header, Observation};
use crate::reader::{ArchiveReader, ReadMode};
pub struct ArchiveWriter {
writer: BufWriter<File>,
observation_count: usize,
snapshot_interval: Option<usize>,
filename: String,
}
impl ArchiveWriter {
pub fn new<P: AsRef<Path>>(
path: P,
snapshot_interval: Option<usize>,
) -> Result<Self, Vec<Diagnostic>> {
let filename = path.as_ref().display().to_string();
let file = match File::create(&path) {
Ok(f) => f,
Err(e) => {
let diagnostic = Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't create the output file: {}", e)
)
.with_advice(
"Make sure you have write permission in this directory and that the path is valid."
.to_string()
);
return Err(vec![diagnostic]);
}
};
let writer = BufWriter::new(file);
Ok(Self {
writer,
observation_count: 0,
snapshot_interval,
filename,
})
}
pub fn new_append<P: AsRef<Path>>(
path: P,
snapshot_interval: Option<usize>,
current_observation_count: usize,
) -> Result<Self, Vec<Diagnostic>> {
let filename = path.as_ref().display().to_string();
let file = match OpenOptions::new().append(true).open(&path) {
Ok(f) => f,
Err(e) => {
let diagnostic = Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't open the archive file for appending: {}", e)
)
.with_advice(
"Make sure the archive file exists and you have write permission."
.to_string()
);
return Err(vec![diagnostic]);
}
};
let writer = BufWriter::new(file);
Ok(Self {
writer,
observation_count: current_observation_count,
snapshot_interval,
filename,
})
}
pub fn write_header(&mut self, header: &Header) -> Result<(), Vec<Diagnostic>> {
let header_json = match serde_json::to_string(header) {
Ok(json) => json,
Err(e) => {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize the header to JSON: {}", e),
)
.with_location(self.filename.clone(), 1)]);
}
};
if let Err(e) = writeln!(self.writer, "{}", header_json) {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output file: {}", e),
)
.with_location(self.filename.clone(), 1)]);
}
Ok(())
}
pub fn write_comment(&mut self, comment: &str) -> Result<(), Vec<Diagnostic>> {
if let Err(e) = writeln!(self.writer, "# {}", comment) {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output file: {}", e),
)]);
}
Ok(())
}
pub fn write_observation(&mut self, observation: Observation) -> Result<(), Vec<Diagnostic>> {
let events = observation.to_events();
for event in events {
let event_json = match serde_json::to_string(&event) {
Ok(json) => json,
Err(e) => {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize an event to JSON: {}", e),
)]);
}
};
if let Err(e) = writeln!(self.writer, "{}", event_json) {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output file: {}", e),
)]);
}
}
self.observation_count += 1;
Ok(())
}
pub fn write_snapshot(&mut self, object: &Value) -> Result<(), Vec<Diagnostic>> {
let snapshot_id = format!("snapshot-{}", Uuid::new_v4());
let snapshot = Event::Snapshot {
observation_id: snapshot_id,
timestamp: Utc::now(),
object: object.clone(),
};
let event_json = match serde_json::to_string(&snapshot) {
Ok(json) => json,
Err(e) => {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize the snapshot to JSON: {}", e),
)]);
}
};
if let Err(e) = writeln!(self.writer, "{}", event_json) {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output file: {}", e),
)]);
}
Ok(())
}
pub fn should_write_snapshot(&self) -> bool {
if let Some(interval) = self.snapshot_interval {
self.observation_count > 0 && self.observation_count % interval == 0
} else {
false
}
}
pub fn finish(mut self) -> Result<(), Vec<Diagnostic>> {
if let Err(e) = self.writer.flush() {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't flush the output file: {}", e),
)]);
}
Ok(())
}
}
pub struct ArchiveBuilder {
initial_state: Option<Value>,
current_state: Value,
source: Option<String>,
snapshot_interval: Option<usize>,
}
impl ArchiveBuilder {
pub fn new() -> Self {
Self {
initial_state: None,
current_state: Value::Null,
source: None,
snapshot_interval: None,
}
}
pub fn with_source(mut self, source: String) -> Self {
self.source = Some(source);
self
}
pub fn with_snapshot_interval(mut self, interval: usize) -> Self {
self.snapshot_interval = Some(interval);
self
}
pub fn add_state(&mut self, state: Value) -> Option<Observation> {
if self.initial_state.is_none() {
self.initial_state = Some(state.clone());
self.current_state = state;
return None;
}
let observation_id = format!("obs-{}", Uuid::new_v4());
let timestamp = Utc::now();
let diff_result: Vec<Event> = diff::diff(&self.current_state, &state, "", &observation_id);
self.current_state = state;
let mut observation = Observation::new(observation_id, timestamp);
for event in diff_result {
observation.add_event(event);
}
Some(observation)
}
pub fn build<P: AsRef<Path>>(self, output_path: P) -> Result<(), Vec<Diagnostic>> {
if self.initial_state.is_none() {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MissingHeaderField,
"I can't build an archive without any initial state.".to_string(),
)]);
}
let header = Header::new(self.initial_state.unwrap(), self.source);
let mut writer = ArchiveWriter::new(output_path, self.snapshot_interval)?;
writer.write_header(&header)?;
writer.finish()?;
Ok(())
}
pub fn get_initial_state(&self) -> Option<&Value> {
self.initial_state.as_ref()
}
}
/// Generate default output filename from input filename
pub fn default_output_filename<P: AsRef<Path>>(input_path: P) -> PathBuf {
let path = input_path.as_ref();
let mut output = path.to_path_buf();
// If it already ends with .json.archive, don't modify it
if let Some(filename) = path.file_name() {
if let Some(filename_str) = filename.to_str() {
if filename_str.ends_with(".json.archive") {
return output;
}
}
}
// Add .json.archive extension
if let Some(extension) = path.extension() {
if extension == "json" {
// Replace .json with .json.archive
output.set_extension("json.archive");
} else {
// Append .json.archive to whatever extension exists
let new_extension = format!("{}.json.archive", extension.to_string_lossy());
output.set_extension(new_extension);
}
} else {
// No extension, just add .json.archive
output.set_extension("json.archive");
}
output
}
pub fn create_archive_from_files<P: AsRef<Path>>(
input_files: &[P],
output_path: P,
source: Option<String>,
snapshot_interval: Option<usize>,
) -> Result<(), Vec<Diagnostic>> {
let mut builder = ArchiveBuilder::new();
if let Some(source) = source {
builder = builder.with_source(source);
}
if let Some(interval) = snapshot_interval {
builder = builder.with_snapshot_interval(interval);
}
let first_content = std::fs::read_to_string(&input_files[0]).map_err(|e| {
vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't read the first input file: {}", e),
)]
})?;
let first_state: Value = serde_json::from_str(&first_content).map_err(|e| {
vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse the first input file as JSON: {}", e),
)
.with_advice("Make sure the file contains valid JSON.".to_string())]
})?;
let _ = builder.add_state(first_state.clone());
let header = Header::new(first_state, builder.source.clone());
let mut writer = ArchiveWriter::new(&output_path, builder.snapshot_interval)?;
writer.write_header(&header)?;
for file_path in input_files[1..].iter() {
writer.write_comment(&format!("Processing file: {:?}", file_path.as_ref()))?;
let content = std::fs::read_to_string(file_path).map_err(|e| {
vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't read the input file: {}", e),
)]
})?;
let state: Value = serde_json::from_str(&content).map_err(|e| {
vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse the input file as JSON: {}", e),
)
.with_advice("Make sure the file contains valid JSON.".to_string())]
})?;
if let Some(observation) = builder.add_state(state.clone()) {
writer.write_observation(observation)?;
if writer.should_write_snapshot() {
writer.write_snapshot(&state)?;
}
}
}
writer.finish()?;
Ok(())
}
pub fn append_to_archive<P: AsRef<Path>, Q: AsRef<Path>>(
archive_path: P,
new_files: &[Q],
output_path: P,
source: Option<String>,
snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
// Read the existing archive to get the final state
let reader = match ArchiveReader::new(&archive_path, ReadMode::AppendSeek) {
Ok(r) => r,
Err(e) => {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't open the archive for reading: {}", e),
)];
}
};
let read_result = match reader.read(&archive_path) {
Ok(result) => result,
Err(e) => {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't read the archive: {}", e),
)];
}
};
// Check for fatal diagnostics in the archive
if read_result.diagnostics.has_fatal() {
let mut diagnostics = vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
"The existing archive contains fatal errors. Cannot append to a corrupt archive.".to_string(),
)];
diagnostics.extend(read_result.diagnostics.into_diagnostics());
return diagnostics;
}
// If output path is different from archive path, copy the archive first
if archive_path.as_ref() != output_path.as_ref() {
if let Err(e) = std::fs::copy(&archive_path, &output_path) {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't copy the archive to the output location: {}", e),
)];
}
}
// Create an append writer
let mut writer = match ArchiveWriter::new_append(&output_path, snapshot_interval, read_result.observation_count) {
Ok(w) => w,
Err(diagnostics) => return diagnostics,
};
// Create a builder to track state changes
let mut builder = ArchiveBuilder::new();
if let Some(source) = source {
builder = builder.with_source(source);
}
if let Some(interval) = snapshot_interval {
builder = builder.with_snapshot_interval(interval);
}
// Initialize builder with the final state from the archive
let current_state = read_result.final_state;
builder.current_state = current_state.clone();
builder.initial_state = Some(current_state.clone());
// Process each new file
for file_path in new_files.iter() {
if let Err(diagnostics) = writer.write_comment(&format!("Processing file: {:?}", file_path.as_ref())) {
return diagnostics;
}
let content = match std::fs::read_to_string(file_path) {
Ok(content) => content,
Err(e) => {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't read the input file: {}", e),
)];
}
};
let state: Value = match serde_json::from_str(&content) {
Ok(state) => state,
Err(e) => {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse the input file as JSON: {}", e),
)
.with_advice("Make sure the file contains valid JSON.".to_string())];
}
};
if let Some(observation) = builder.add_state(state.clone()) {
if let Err(diagnostics) = writer.write_observation(observation) {
return diagnostics;
}
if writer.should_write_snapshot() {
if let Err(diagnostics) = writer.write_snapshot(&state) {
return diagnostics;
}
}
}
}
// Finish writing
match writer.finish() {
Ok(()) => Vec::new(),
Err(diagnostics) => diagnostics,
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_archive_writer_header() -> Result<(), Box<dyn std::error::Error>> {
let temp_file = NamedTempFile::new()?;
let header = Header::new(json!({"test": "value"}), Some("test-source".to_string()));
{
let mut writer = ArchiveWriter::new(temp_file.path(), None)
.map_err(|_| "Failed to create writer")?;
writer
.write_header(&header)
.map_err(|_| "Failed to write header")?;
writer.finish().map_err(|_| "Failed to finish")?;
}
let content = std::fs::read_to_string(temp_file.path())?;
let lines: Vec<&str> = content.lines().collect();
assert_eq!(lines.len(), 1);
let parsed_header: Header = serde_json::from_str(lines[0])?;
assert_eq!(parsed_header.file_type, "@peoplesgrocers/json-archive");
assert_eq!(parsed_header.version, 1);
assert_eq!(parsed_header.initial, json!({"test": "value"}));
Ok(())
}
#[test]
fn test_archive_builder() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = ArchiveBuilder::new();
// First state becomes initial
let result = builder.add_state(json!({"count": 0}));
assert!(result.is_none());
// Second state generates observation
let observation = builder
.add_state(json!({"count": 1}))
.expect("Should generate observation");
assert!(!observation.events.is_empty());
Ok(())
}
#[test]
fn test_create_archive_from_files() -> Result<(), Box<dyn std::error::Error>> {
// Create temporary input files
let mut file1 = NamedTempFile::new()?;
let mut file2 = NamedTempFile::new()?;
let output_file = NamedTempFile::new()?;
writeln!(file1, r#"{{"count": 0, "name": "test"}}"#)?;
writeln!(file2, r#"{{"count": 1, "name": "test"}}"#)?;
let input_files = vec![file1.path(), file2.path()];
create_archive_from_files(
&input_files,
output_file.path(),
Some("test-source".to_string()),
None,
)
.map_err(|_| "Failed to create archive")?;
let content = std::fs::read_to_string(output_file.path())?;
let lines: Vec<&str> = content.lines().collect();
assert!(lines.len() >= 2); // At least header + comment + observe + change events
// First line should be header
let header: Header = serde_json::from_str(lines[0])?;
assert_eq!(header.file_type, "@peoplesgrocers/json-archive");
assert_eq!(header.version, 1);
assert_eq!(header.initial, json!({"count": 0, "name": "test"}));
Ok(())
}
#[test]
fn test_snapshot_interval() -> Result<(), Box<dyn std::error::Error>> {
let temp_file = NamedTempFile::new()?;
let mut writer =
ArchiveWriter::new(temp_file.path(), Some(2)).map_err(|_| "Failed to create writer")?;
assert!(!writer.should_write_snapshot()); // No observations yet
let obs1 = Observation::new("obs-1".to_string(), Utc::now());
writer
.write_observation(obs1)
.map_err(|_| "Failed to write observation")?;
assert!(!writer.should_write_snapshot()); // 1 observation, interval is 2
let obs2 = Observation::new("obs-2".to_string(), Utc::now());
writer
.write_observation(obs2)
.map_err(|_| "Failed to write observation")?;
assert!(writer.should_write_snapshot()); // 2 observations, should snapshot
Ok(())
}
#[test]
fn test_default_output_filename() {
assert_eq!(
default_output_filename("test.json"),
PathBuf::from("test.json.archive")
);
assert_eq!(
default_output_filename("test.txt"),
PathBuf::from("test.txt.json.archive")
);
assert_eq!(
default_output_filename("test"),
PathBuf::from("test.json.archive")
);
assert_eq!(
default_output_filename("test.json.archive"),
PathBuf::from("test.json.archive")
);
}
}

src/archive_context.rs (new file): 595 lines

@ -0,0 +1,595 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
//! Archive write context and shared observation writing logic.
//!
//! This module provides:
//! - `WriteContext`: A struct that holds the state needed to write observations
//! - `write_observations`: The shared logic for diffing JSON files and writing events
//!
//! The key insight is that both create and append operations share the same
//! core logic once they've set up their initial state and writer.
use chrono::{DateTime, Utc};
use serde_json::Value;
use std::io::Write;
use std::path::{Path, PathBuf};
use uuid::Uuid;
use crate::atomic_file::atomic_replace_file;
use crate::detection::CompressionFormat;
use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector};
use crate::diff;
use crate::events::{Event, Observation};
/// Strategy for finishing the write operation.
#[derive(Debug, Clone)]
pub enum FinishStrategy {
/// Just flush the writer. Used for:
/// - Creating new archives
/// - Appending to uncompressed archives (same file)
FlushOnly,
/// Atomic replace: swap temp file with original. Used for:
/// - Appending to compressed archives (rewrite strategy)
AtomicReplace {
temp_path: PathBuf,
output_path: PathBuf,
},
}
/// Context for writing observations to an archive.
///
/// This struct is the result of the "setup phase" for both create and append
/// operations. Once you have a WriteContext, you can use `write_observations`
/// to add new states, then call `finish` to complete the operation.
pub struct WriteContext<W: Write> {
/// The writer to output JSON lines to.
pub writer: W,
/// Current state of the archive (used for diffing).
pub current_state: Value,
/// Number of observations already in the archive.
pub observation_count: usize,
/// Optional interval for writing snapshots.
pub snapshot_interval: Option<usize>,
/// How to finish the write operation.
pub finish_strategy: FinishStrategy,
/// Diagnostics collected during setup (e.g., warnings from reading existing archive).
pub diagnostics: DiagnosticCollector,
}
impl<W: Write> WriteContext<W> {
/// Create a new write context.
pub fn new(
writer: W,
current_state: Value,
observation_count: usize,
snapshot_interval: Option<usize>,
finish_strategy: FinishStrategy,
) -> Self {
Self {
writer,
current_state,
observation_count,
snapshot_interval,
finish_strategy,
diagnostics: DiagnosticCollector::new(),
}
}
/// Create a write context with existing diagnostics.
pub fn with_diagnostics(
writer: W,
current_state: Value,
observation_count: usize,
snapshot_interval: Option<usize>,
finish_strategy: FinishStrategy,
diagnostics: DiagnosticCollector,
) -> Self {
Self {
writer,
current_state,
observation_count,
snapshot_interval,
finish_strategy,
diagnostics,
}
}
/// Write observations for a list of JSON files.
///
/// For each file:
/// 1. Reads and parses the JSON
/// 2. Diffs against current state
/// 3. Writes observation events
/// 4. Optionally writes a snapshot if interval is reached
/// 5. Updates current state
///
/// Returns the number of observations written.
pub fn write_observations<P: AsRef<Path>>(
&mut self,
files: &[P],
) -> Result<usize, Vec<Diagnostic>> {
let mut observations_written = 0;
for file_path in files.iter() {
let file_path = file_path.as_ref();
// Write comment marking which file we're processing
if let Err(e) = writeln!(self.writer, "# Processing file: {}", file_path.display()) {
return Err(vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output: {}", e),
)]);
}
// Get file modification time for the observation timestamp
let file_mtime = get_file_mtime(file_path)?;
// Read and parse new state
let content = std::fs::read_to_string(file_path).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read the input file '{}': {}", file_path.display(), e),
)]
})?;
let new_state: Value = serde_json::from_str(&content).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse '{}' as JSON: {}", file_path.display(), e),
)
.with_advice("Make sure the file contains valid JSON.".to_string())]
})?;
// Generate diff and create observation
let observation_id = format!("obs-{}", Uuid::new_v4());
let diff_events = diff::diff(&self.current_state, &new_state, "", &observation_id);
// Skip if no changes
if diff_events.is_empty() {
continue;
}
// Create and write observation
let mut observation = Observation::new(observation_id, file_mtime);
for event in diff_events {
observation.add_event(event);
}
self.write_observation(observation)?;
observations_written += 1;
self.observation_count += 1;
// Check if we should write a snapshot
if self.should_write_snapshot() {
self.write_snapshot(&new_state, file_mtime)?;
}
// Update current state for next iteration
self.current_state = new_state;
}
Ok(observations_written)
}
/// Write a single observation's events to the output.
fn write_observation(&mut self, observation: Observation) -> Result<(), Vec<Diagnostic>> {
for event in observation.to_events() {
let event_json = serde_json::to_string(&event).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize an event to JSON: {}", e),
)]
})?;
writeln!(self.writer, "{}", event_json).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output: {}", e),
)]
})?;
}
Ok(())
}
/// Check if we should write a snapshot based on observation count.
fn should_write_snapshot(&self) -> bool {
if let Some(interval) = self.snapshot_interval {
self.observation_count > 0 && self.observation_count % interval == 0
} else {
false
}
}
/// Write a snapshot event.
fn write_snapshot(&mut self, state: &Value, timestamp: DateTime<Utc>) -> Result<(), Vec<Diagnostic>> {
let snapshot_id = format!("snapshot-{}", Uuid::new_v4());
let snapshot = Event::Snapshot {
observation_id: snapshot_id,
timestamp,
object: state.clone(),
};
let snapshot_json = serde_json::to_string(&snapshot).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize the snapshot to JSON: {}", e),
)]
})?;
writeln!(self.writer, "{}", snapshot_json).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output: {}", e),
)]
})?;
Ok(())
}
/// Finish the write operation.
///
/// This flushes the writer and, for compressed append operations,
/// performs the atomic file replacement.
pub fn finish(mut self) -> Result<DiagnosticCollector, Vec<Diagnostic>> {
// Flush the writer
self.writer.flush().map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't flush the output file: {}", e),
)]
})?;
// Handle atomic replacement if needed
match self.finish_strategy {
FinishStrategy::FlushOnly => {
// Nothing more to do
}
FinishStrategy::AtomicReplace { temp_path, output_path } => {
atomic_replace_file(&output_path, &temp_path)?;
}
}
Ok(self.diagnostics)
}
}
/// Get the file modification time as a DateTime<Utc>.
fn get_file_mtime<P: AsRef<Path>>(path: P) -> Result<DateTime<Utc>, Vec<Diagnostic>> {
let path = path.as_ref();
let metadata = std::fs::metadata(path).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't get metadata for '{}': {}", path.display(), e),
)]
})?;
let modified = metadata.modified().map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't get modification time for '{}': {}", path.display(), e),
)]
})?;
Ok(modified.into())
}
/// Encoder wrapper that provides a uniform interface for different compression formats.
///
/// This enum wraps the various compression encoders so we can treat them uniformly
/// in the append-to-compressed-archive flow.
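///
/// A rough usage sketch (illustration only; assumes the `compression` feature
/// is enabled and `out.json.archive.gz` is a writable path):
///
/// ```ignore
/// use std::io::Write;
///
/// let file = std::fs::File::create("out.json.archive.gz")?;
/// let mut w = CompressedWriter::new(CompressionFormat::Gzip, file)?;
/// w.write_all(b"hello\n")?; // archive lines are written through the encoder
/// w.finish()?;              // finalize the compressed stream before closing
/// ```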
#[cfg(feature = "compression")]
pub enum CompressedWriter {
Gzip(flate2::write::GzEncoder<std::fs::File>),
Zlib(flate2::write::ZlibEncoder<std::fs::File>),
Zstd(zstd::stream::write::Encoder<'static, std::fs::File>),
Brotli(brotli::CompressorWriter<std::fs::File>),
}
#[cfg(feature = "compression")]
impl Write for CompressedWriter {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
match self {
CompressedWriter::Gzip(w) => w.write(buf),
CompressedWriter::Zlib(w) => w.write(buf),
CompressedWriter::Zstd(w) => w.write(buf),
CompressedWriter::Brotli(w) => w.write(buf),
}
}
fn flush(&mut self) -> std::io::Result<()> {
match self {
CompressedWriter::Gzip(w) => w.flush(),
CompressedWriter::Zlib(w) => w.flush(),
CompressedWriter::Zstd(w) => w.flush(),
CompressedWriter::Brotli(w) => w.flush(),
}
}
}
#[cfg(feature = "compression")]
impl CompressedWriter {
/// Create a new compressed writer for the given format and file.
pub fn new(format: CompressionFormat, file: std::fs::File) -> Result<Self, Diagnostic> {
use flate2::Compression;
match format {
CompressionFormat::Gzip => {
Ok(CompressedWriter::Gzip(flate2::write::GzEncoder::new(file, Compression::default())))
}
CompressionFormat::Zlib => {
Ok(CompressedWriter::Zlib(flate2::write::ZlibEncoder::new(file, Compression::default())))
}
CompressionFormat::Zstd => {
let encoder = zstd::stream::write::Encoder::new(file, 0).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't create zstd encoder: {}", e),
)
})?;
Ok(CompressedWriter::Zstd(encoder))
}
CompressionFormat::Brotli => {
Ok(CompressedWriter::Brotli(brotli::CompressorWriter::new(file, 4096, 11, 22)))
}
CompressionFormat::Deflate => {
// Deflate is typically used within gzip/zlib, not standalone for files
Err(Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
"Standalone deflate compression is not supported for writing.".to_string(),
))
}
CompressionFormat::None => {
Err(Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
"CompressedWriter::new called with CompressionFormat::None".to_string(),
))
}
}
}
/// Finish compression and return any errors.
///
/// This must be called before the file is closed to ensure all
/// compressed data is flushed.
pub fn finish(self) -> Result<(), Diagnostic> {
match self {
CompressedWriter::Gzip(w) => {
w.finish().map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't finish gzip compression: {}", e),
)
})?;
}
CompressedWriter::Zlib(w) => {
w.finish().map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't finish zlib compression: {}", e),
)
})?;
}
CompressedWriter::Zstd(w) => {
w.finish().map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't finish zstd compression: {}", e),
)
})?;
}
CompressedWriter::Brotli(mut w) => {
// brotli's CompressorWriter has no finish(); the stream is finalized
// when the writer is dropped, so flushing here is sufficient
w.flush().map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't flush brotli compression: {}", e),
)
})?;
}
}
Ok(())
}
}
/// A write context specifically for compressed output.
///
/// This wraps WriteContext to handle the finish() call properly for
/// compressed writers, which need to call finish() on the encoder
/// before the atomic file swap.
#[cfg(feature = "compression")]
pub struct CompressedWriteContext {
/// The inner write context.
inner: WriteContext<CompressedWriter>,
}
#[cfg(feature = "compression")]
impl CompressedWriteContext {
/// Create a new compressed write context.
pub fn new(
writer: CompressedWriter,
current_state: Value,
observation_count: usize,
snapshot_interval: Option<usize>,
finish_strategy: FinishStrategy,
diagnostics: DiagnosticCollector,
) -> Self {
Self {
inner: WriteContext::with_diagnostics(
writer,
current_state,
observation_count,
snapshot_interval,
finish_strategy,
diagnostics,
),
}
}
/// Write observations for a list of JSON files.
pub fn write_observations<P: AsRef<Path>>(
&mut self,
files: &[P],
) -> Result<usize, Vec<Diagnostic>> {
self.inner.write_observations(files)
}
/// Write raw bytes to the output (used for copying existing archive content).
pub fn write_raw(&mut self, bytes: &[u8]) -> Result<(), Vec<Diagnostic>> {
self.inner.writer.write_all(bytes).map_err(|e| {
vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output: {}", e),
)]
})
}
/// Finish the write operation.
///
/// This finishes the compression encoder, then performs any atomic
/// file operations needed.
pub fn finish(self) -> Result<DiagnosticCollector, Vec<Diagnostic>> {
let finish_strategy = self.inner.finish_strategy.clone();
let diagnostics = self.inner.diagnostics;
// Finish compression first
self.inner.writer.finish().map_err(|d| vec![d])?;
// Then handle atomic replacement if needed
match finish_strategy {
FinishStrategy::FlushOnly => {
// Nothing more to do
}
FinishStrategy::AtomicReplace { temp_path, output_path } => {
atomic_replace_file(&output_path, &temp_path)?;
}
}
Ok(diagnostics)
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn test_write_context_single_observation() {
let mut output = Vec::new();
let initial_state = json!({"count": 0});
{
let mut ctx = WriteContext::new(
&mut output,
initial_state,
0,
None,
FinishStrategy::FlushOnly,
);
// Create a temp file with new state
let mut temp_file = tempfile::NamedTempFile::new().unwrap();
std::io::Write::write_all(&mut temp_file, br#"{"count": 1}"#).unwrap();
temp_file.flush().unwrap();
let count = ctx.write_observations(&[temp_file.path()]).unwrap();
assert_eq!(count, 1);
}
let output_str = String::from_utf8(output).unwrap();
assert!(output_str.contains("# Processing file:"));
assert!(output_str.contains("observe"));
assert!(output_str.contains("change"));
assert!(output_str.contains("/count"));
}
#[test]
fn test_write_context_no_changes() {
let mut output = Vec::new();
let initial_state = json!({"count": 0});
{
let mut ctx = WriteContext::new(
&mut output,
initial_state,
0,
None,
FinishStrategy::FlushOnly,
);
// Create a temp file with same state
let mut temp_file = tempfile::NamedTempFile::new().unwrap();
std::io::Write::write_all(&mut temp_file, br#"{"count": 0}"#).unwrap();
temp_file.flush().unwrap();
let count = ctx.write_observations(&[temp_file.path()]).unwrap();
assert_eq!(count, 0);
}
let output_str = String::from_utf8(output).unwrap();
// Should have comment but no events
assert!(output_str.contains("# Processing file:"));
assert!(!output_str.contains("observe"));
}
#[test]
fn test_should_write_snapshot() {
let output: Vec<u8> = Vec::new();
// No interval set
let ctx: WriteContext<Vec<u8>> = WriteContext::new(
output.clone(),
json!({}),
5,
None,
FinishStrategy::FlushOnly,
);
assert!(!ctx.should_write_snapshot());
// Interval of 2, at observation 4 (multiple of 2)
let ctx: WriteContext<Vec<u8>> = WriteContext::new(
output.clone(),
json!({}),
4,
Some(2),
FinishStrategy::FlushOnly,
);
assert!(ctx.should_write_snapshot());
// Interval of 2, at observation 3 (not multiple of 2)
let ctx: WriteContext<Vec<u8>> = WriteContext::new(
output,
json!({}),
3,
Some(2),
FinishStrategy::FlushOnly,
);
assert!(!ctx.should_write_snapshot());
}
}

233
src/archive_open.rs Normal file
View file

@ -0,0 +1,233 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
//! Unified archive file opening with compression detection.
//!
//! This module provides a single entry point for opening archive files that:
//! - Detects compression format from magic bytes
//! - Creates the appropriate decompressor
//! - Returns a unified `BufRead` interface
//!
//! This eliminates duplicated compression detection logic across the codebase.
use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use crate::detection::{detect_compression_format, CompressionFormat};
use crate::diagnostics::{Diagnostic, DiagnosticCode};
#[cfg(feature = "compression")]
use brotli::Decompressor;
#[cfg(feature = "compression")]
use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
#[cfg(feature = "compression")]
use zstd::stream::read::Decoder as ZstdDecoder;
/// Result of opening an archive file for reading.
pub struct OpenedArchive {
/// Buffered reader that handles decompression transparently.
pub reader: Box<dyn BufRead>,
/// The detected compression format.
pub format: CompressionFormat,
}
/// Opens an archive file and returns a buffered reader that handles decompression.
///
/// This function:
/// 1. Opens the file
/// 2. Reads magic bytes to detect compression
/// 3. Reopens and wraps with appropriate decompressor
/// 4. Returns a unified `BufRead` interface
///
/// # Arguments
///
/// * `path` - Path to the archive file
///
/// # Returns
///
/// Returns `OpenedArchive` containing the reader and detected format,
/// or a diagnostic if the file couldn't be opened.
///
/// # Feature flags
///
/// When built without the `compression` feature, compressed files are still
/// detected, but the returned reader yields the raw, still-compressed bytes
/// rather than an error. Callers should check the returned format (for
/// example with `check_compression_support`) before trying to parse.
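///
/// # Example
///
/// A minimal sketch (illustration only; `history.json.archive.gz` is a
/// hypothetical path):
///
/// ```ignore
/// use std::io::BufRead;
///
/// let opened = open_archive("history.json.archive.gz")?;
/// println!("compression: {:?}", opened.format);
/// for line in opened.reader.lines() {
///     let line = line?;
///     // each line is one JSON event, already decompressed
/// }
/// ```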
pub fn open_archive<P: AsRef<Path>>(path: P) -> Result<OpenedArchive, Diagnostic> {
let path = path.as_ref();
let filename = path.display().to_string();
// Open file and read magic bytes
let mut file = File::open(path).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't open the archive file: {}", e),
)
.with_location(filename.clone(), 1)
})?;
let mut magic_bytes = [0u8; 4];
let bytes_read = file.read(&mut magic_bytes).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read from the archive file: {}", e),
)
.with_location(filename.clone(), 1)
})?;
let format = detect_compression_format(path, &magic_bytes[..bytes_read]);
// Reopen file to reset position
let file = File::open(path).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't reopen the archive file: {}", e),
)
.with_location(filename.clone(), 1)
})?;
// Create appropriate reader based on compression format
#[cfg(feature = "compression")]
let reader: Box<dyn BufRead> = match format {
CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))),
CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))),
CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))),
CompressionFormat::Brotli => Box::new(BufReader::new(Decompressor::new(file, 4096))),
CompressionFormat::Zstd => {
let decoder = ZstdDecoder::new(file).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't create zstd decoder: {}", e),
)
.with_location(filename.clone(), 1)
})?;
Box::new(BufReader::new(decoder))
}
CompressionFormat::None => Box::new(BufReader::new(file)),
};
#[cfg(not(feature = "compression"))]
let reader: Box<dyn BufRead> = Box::new(BufReader::new(file));
Ok(OpenedArchive { reader, format })
}
/// Checks if the detected compression format is supported by this build.
///
/// Returns a diagnostic error if compression was detected but the binary
/// was built without compression support.
#[cfg_attr(feature = "compression", allow(unused_variables))]
pub fn check_compression_support(
format: CompressionFormat,
filename: &str,
) -> Result<(), Diagnostic> {
#[cfg(not(feature = "compression"))]
if format != CompressionFormat::None {
let format_name = match format {
CompressionFormat::Gzip => "gzip",
CompressionFormat::Deflate => "deflate",
CompressionFormat::Zlib => "zlib",
CompressionFormat::Brotli => "brotli",
CompressionFormat::Zstd => "zstd",
CompressionFormat::None => unreachable!(),
};
return Err(Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
format!(
"I detected a {}-compressed archive, but this build doesn't support compression.",
format_name
),
)
.with_location(filename.to_string(), 1)
.with_advice(
"This binary was built without compression support to reduce binary size and dependencies.\n\
You have two options:\n\
1. Install the version with compression support: cargo install json-archive --features compression\n\
2. Manually decompress the file first, then use this tool on the uncompressed archive"
.to_string(),
));
}
Ok(())
}
/// Convenience function to check if a file is compressed.
///
/// This opens the file, reads magic bytes, and returns the compression format.
/// Useful when you need to know the format before deciding how to process the file.
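///
/// A small sketch (illustration only; `data.json.archive.zst` is a hypothetical path):
///
/// ```ignore
/// let format = detect_archive_compression("data.json.archive.zst")?;
/// if format != CompressionFormat::None {
///     // take the decompress-and-rewrite append path
/// }
/// ```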
pub fn detect_archive_compression<P: AsRef<Path>>(path: P) -> Result<CompressionFormat, Diagnostic> {
let path = path.as_ref();
let filename = path.display().to_string();
let mut file = File::open(path).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't open the file to check compression: {}", e),
)
.with_location(filename.clone(), 1)
})?;
let mut magic_bytes = [0u8; 4];
let bytes_read = file.read(&mut magic_bytes).map_err(|e| {
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read from the file: {}", e),
)
.with_location(filename, 1)
})?;
Ok(detect_compression_format(path, &magic_bytes[..bytes_read]))
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_open_uncompressed_archive() {
let mut temp_file = NamedTempFile::new().unwrap();
writeln!(temp_file, r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#).unwrap();
temp_file.flush().unwrap();
let opened = open_archive(temp_file.path()).unwrap();
assert_eq!(opened.format, CompressionFormat::None);
}
#[test]
fn test_detect_archive_compression_uncompressed() {
let mut temp_file = NamedTempFile::new().unwrap();
writeln!(temp_file, "plain text content").unwrap();
temp_file.flush().unwrap();
let format = detect_archive_compression(temp_file.path()).unwrap();
assert_eq!(format, CompressionFormat::None);
}
#[test]
fn test_open_nonexistent_file() {
let result = open_archive("/nonexistent/path/to/file.json.archive");
assert!(result.is_err());
}
}

644
src/archive_ops.rs Normal file
View file

@ -0,0 +1,644 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
//! High-level archive operations: create and append.
//!
//! This module provides the top-level entry points for creating and appending
//! to archives. These functions handle all the setup (opening files, detecting
//! compression, reading existing state) and then delegate to the shared
//! `WriteContext` for the actual observation writing.
//!
//! ## Architecture
//!
//! ```text
//! ┌─────────────────┐
//! │ archive_ops.rs │
//! │ (this module) │
//! └────────┬────────┘
//! │
//! ┌─────────────────┼─────────────────┐
//! │ │ │
//! ▼ ▼ ▼
//! ┌───────────────┐ ┌───────────────┐ ┌───────────────┐
//! │ archive_open │ │archive_context│ │ archive_reader│
//! │ (compression) │ │ (WriteContext)│ │ (parsing) │
//! └───────────────┘ └───────────────┘ └───────────────┘
//! ```
//!
//! ## Operations
//!
//! - `create_archive`: Create a new archive from one or more JSON files
//! - `append_to_archive`: Add observations to an existing archive
use std::fs::{File, OpenOptions};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use serde_json::Value;
use crate::archive_context::{FinishStrategy, WriteContext};
use crate::archive_open::{check_compression_support, detect_archive_compression, open_archive};
use crate::archive_reader::{ArchiveReader, ReadMode};
use crate::atomic_file::generate_temp_filename;
use crate::detection::CompressionFormat;
use crate::diagnostics::{Diagnostic, DiagnosticCode};
use crate::events::Header;
#[cfg(feature = "compression")]
use crate::archive_context::{CompressedWriteContext, CompressedWriter};
/// Create a new archive from a list of JSON files.
///
/// The first file becomes the initial state in the header. Each subsequent
/// file generates an observation with the diff from the previous state.
///
/// # Arguments
///
/// * `input_files` - List of JSON files to process (at least one required)
/// * `output_path` - Path for the new archive file
/// * `source` - Optional source identifier for the header
/// * `snapshot_interval` - Optional interval for writing snapshots
///
/// # Returns
///
/// Returns an empty Vec on success, or a Vec of diagnostics on error.
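///
/// # Example
///
/// A sketch of the intended call (illustration only; the file names are hypothetical):
///
/// ```ignore
/// let diagnostics = create_archive(
///     &["state-001.json", "state-002.json"],
///     "history.json.archive",
///     Some("nightly-export".to_string()),
///     Some(100), // write a snapshot every 100 observations
/// );
/// assert!(diagnostics.is_empty(), "{:?}", diagnostics);
/// ```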
pub fn create_archive<P: AsRef<Path>>(
input_files: &[P],
output_path: P,
source: Option<String>,
snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
if input_files.is_empty() {
return vec![Diagnostic::fatal(
DiagnosticCode::MissingHeaderField,
"I need at least one input file to create an archive.".to_string(),
)];
}
// Read and parse the first file to get initial state
let first_path = input_files[0].as_ref();
let first_content = match std::fs::read_to_string(first_path) {
Ok(content) => content,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read the first input file '{}': {}", first_path.display(), e),
)];
}
};
let initial_state: Value = match serde_json::from_str(&first_content) {
Ok(state) => state,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse '{}' as JSON: {}", first_path.display(), e),
)
.with_advice("Make sure the file contains valid JSON.".to_string())];
}
};
// Create the output file
let output_path = output_path.as_ref();
let file = match File::create(output_path) {
Ok(f) => f,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't create the output file '{}': {}", output_path.display(), e),
)
.with_advice(
"Make sure you have write permission in this directory and that the path is valid."
.to_string(),
)];
}
};
let mut writer = BufWriter::new(file);
// Write the header
let header = Header::new(initial_state.clone(), source);
let header_json = match serde_json::to_string(&header) {
Ok(json) => json,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't serialize the header to JSON: {}", e),
)];
}
};
if let Err(e) = writeln!(writer, "{}", header_json) {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't write to the output file: {}", e),
)];
}
// If there are more files, process them through WriteContext
if input_files.len() > 1 {
let mut ctx = WriteContext::new(
writer,
initial_state,
0,
snapshot_interval,
FinishStrategy::FlushOnly,
);
// Process remaining files (skip the first one which is now the initial state)
let remaining_files: Vec<&Path> = input_files[1..].iter().map(|p| p.as_ref()).collect();
if let Err(diagnostics) = ctx.write_observations(&remaining_files) {
return diagnostics;
}
if let Err(diagnostics) = ctx.finish() {
return diagnostics;
}
} else {
// Just flush the header
if let Err(e) = writer.flush() {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't flush the output file: {}", e),
)];
}
}
Vec::new()
}
/// Append observations to an existing archive.
///
/// This function handles both compressed and uncompressed archives:
/// - Uncompressed: Opens in append mode and writes new observations directly
/// - Compressed: Reads entire archive, writes to temp file, atomic swap
///
/// # Arguments
///
/// * `archive_path` - Path to the existing archive
/// * `new_files` - List of JSON files to add as observations
/// * `output_path` - Where to write the result (can be same as archive_path)
/// * `source` - Optional source identifier (not currently used for append)
/// * `snapshot_interval` - Optional interval for writing snapshots
///
/// # Returns
///
/// Returns an empty Vec on success, or a Vec of diagnostics on error.
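///
/// # Example
///
/// A sketch of an in-place append (illustration only; file names are hypothetical):
///
/// ```ignore
/// let diagnostics = append_to_archive(
///     "history.json.archive.gz", // existing, possibly compressed, archive
///     &["state-003.json"],
///     "history.json.archive.gz", // same path: rewrite via temp file + atomic swap
///     None,
///     None,
/// );
/// assert!(diagnostics.is_empty(), "{:?}", diagnostics);
/// ```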
pub fn append_to_archive<P: AsRef<Path>, Q: AsRef<Path>>(
archive_path: P,
new_files: &[Q],
output_path: P,
_source: Option<String>,
snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
let archive_path = archive_path.as_ref();
let output_path = output_path.as_ref();
// Detect compression format
let format = match detect_archive_compression(archive_path) {
Ok(f) => f,
Err(diag) => return vec![diag],
};
// Check if this build supports the detected compression
if let Err(diag) = check_compression_support(format, &archive_path.display().to_string()) {
return vec![diag];
}
if format == CompressionFormat::None {
append_to_uncompressed_archive(archive_path, new_files, output_path, snapshot_interval)
} else {
append_to_compressed_archive(archive_path, new_files, output_path, format, snapshot_interval)
}
}
/// Append to an uncompressed archive.
///
/// This reads the archive to get the final state, then opens the file
/// in append mode to add new observations.
fn append_to_uncompressed_archive<P: AsRef<Path>, Q: AsRef<Path>>(
archive_path: P,
new_files: &[Q],
output_path: P,
snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
let archive_path = archive_path.as_ref();
let output_path = output_path.as_ref();
// Read the existing archive to get final state
let reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) {
Ok(r) => r,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't open the archive for reading: {}", e),
)];
}
};
let read_result = match reader.read(archive_path) {
Ok(result) => result,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read the archive: {}", e),
)];
}
};
// Check for fatal diagnostics in the archive
if read_result.diagnostics.has_fatal() {
let mut diagnostics = vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
"The existing archive contains fatal errors. Cannot append to a corrupt archive."
.to_string(),
)];
diagnostics.extend(read_result.diagnostics.into_diagnostics());
return diagnostics;
}
// If output path is different from archive path, copy the archive first
if archive_path != output_path {
if let Err(e) = std::fs::copy(archive_path, output_path) {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't copy the archive to the output location: {}", e),
)];
}
}
// Open file in append mode
let file = match OpenOptions::new().append(true).open(output_path) {
Ok(f) => f,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't open the archive file for appending: {}", e),
)
.with_advice(
"Make sure the archive file exists and you have write permission.".to_string(),
)];
}
};
// Create write context and process files
let mut ctx = WriteContext::with_diagnostics(
file,
read_result.final_state,
read_result.observation_count,
snapshot_interval,
FinishStrategy::FlushOnly,
read_result.diagnostics,
);
let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect();
if let Err(diagnostics) = ctx.write_observations(&file_refs) {
return diagnostics;
}
match ctx.finish() {
Ok(collector) => collector.into_diagnostics(),
Err(diagnostics) => diagnostics,
}
}
/// Append to a compressed archive.
///
/// This reads the entire archive (decompressing), writes everything to a
/// new compressed temp file with the new observations, then atomically
/// swaps the temp file with the original.
#[cfg(feature = "compression")]
fn append_to_compressed_archive<P: AsRef<Path>, Q: AsRef<Path>>(
archive_path: P,
new_files: &[Q],
output_path: P,
format: CompressionFormat,
snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
let archive_path = archive_path.as_ref();
let output_path = output_path.as_ref();
// Step 1: Open and decompress the archive, reading all bytes
let opened = match open_archive(archive_path) {
Ok(o) => o,
Err(diag) => return vec![diag],
};
// Read all decompressed bytes into memory
let mut decompressed_bytes = Vec::new();
let mut reader = opened.reader;
if let Err(e) = reader.read_to_end(&mut decompressed_bytes) {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't read the compressed archive: {}", e),
)];
}
// Step 2: Parse the archive to get the final state using AppendSeek mode.
// This re-opens and decompresses the archive a second time via ArchiveReader.
let archive_reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) {
Ok(r) => r,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't create archive reader: {}", e),
)];
}
};
let read_result = match archive_reader.read(archive_path) {
Ok(result) => result,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't parse the archive: {}", e),
)];
}
};
// Check for fatal diagnostics
if read_result.diagnostics.has_fatal() {
let mut diagnostics = vec![Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
"The existing archive contains fatal errors. Cannot append to a corrupt archive."
.to_string(),
)];
diagnostics.extend(read_result.diagnostics.into_diagnostics());
return diagnostics;
}
// Step 3: Create temp file with same compression format
let temp_path = generate_temp_filename(output_path);
let temp_file = match File::create(&temp_path) {
Ok(f) => f,
Err(e) => {
return vec![Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't create temp file: {}", e),
)];
}
};
// Create compressed writer
let compressed_writer = match CompressedWriter::new(format, temp_file) {
Ok(w) => w,
Err(diag) => {
let _ = std::fs::remove_file(&temp_path);
return vec![diag];
}
};
// Step 4: Create write context and copy old data + write new observations
let mut ctx = CompressedWriteContext::new(
compressed_writer,
read_result.final_state,
read_result.observation_count,
snapshot_interval,
FinishStrategy::AtomicReplace {
temp_path: temp_path.clone(),
output_path: output_path.to_path_buf(),
},
read_result.diagnostics,
);
// Write all old decompressed bytes first
if let Err(diagnostics) = ctx.write_raw(&decompressed_bytes) {
let _ = std::fs::remove_file(&temp_path);
return diagnostics;
}
// Write new observations
let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect();
if let Err(diagnostics) = ctx.write_observations(&file_refs) {
let _ = std::fs::remove_file(&temp_path);
return diagnostics;
}
// Finish (this handles compression finalization and atomic swap)
match ctx.finish() {
Ok(collector) => collector.into_diagnostics(),
Err(diagnostics) => {
let _ = std::fs::remove_file(&temp_path);
diagnostics
}
}
}
/// Stub for when compression feature is not enabled.
#[cfg(not(feature = "compression"))]
fn append_to_compressed_archive<P: AsRef<Path>, Q: AsRef<Path>>(
archive_path: P,
_new_files: &[Q],
_output_path: P,
format: CompressionFormat,
_snapshot_interval: Option<usize>,
) -> Vec<Diagnostic> {
let format_name = match format {
CompressionFormat::Gzip => "gzip",
CompressionFormat::Deflate => "deflate",
CompressionFormat::Zlib => "zlib",
CompressionFormat::Brotli => "brotli",
CompressionFormat::Zstd => "zstd",
CompressionFormat::None => unreachable!(),
};
vec![Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
format!(
"I detected a {}-compressed archive, but this build doesn't support compression.",
format_name
),
)
.with_location(archive_path.as_ref().display().to_string(), 1)
.with_advice(
"This binary was built without compression support.\n\
Install with compression: cargo install json-archive --features compression\n\
Or decompress the file first."
.to_string(),
)]
}
/// Generate default output filename from input filename.
///
/// - `test.json` -> `test.json.archive`
/// - `test.txt` -> `test.txt.json.archive`
/// - `test` -> `test.json.archive`
/// - `test.json.archive` -> `test.json.archive` (unchanged)
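///
/// A quick sketch (illustration only):
///
/// ```ignore
/// assert_eq!(
///     default_output_filename("metrics.json"),
///     std::path::PathBuf::from("metrics.json.archive")
/// );
/// ```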
pub fn default_output_filename<P: AsRef<Path>>(input_path: P) -> PathBuf {
let path = input_path.as_ref();
let mut output = path.to_path_buf();
// If it already ends with .json.archive, don't modify it
if let Some(filename) = path.file_name() {
if let Some(filename_str) = filename.to_str() {
if filename_str.ends_with(".json.archive") {
return output;
}
}
}
// Add .json.archive extension
if let Some(extension) = path.extension() {
if extension == "json" {
// Replace .json with .json.archive
output.set_extension("json.archive");
} else {
// Append .json.archive to whatever extension exists
let new_extension = format!("{}.json.archive", extension.to_string_lossy());
output.set_extension(new_extension);
}
} else {
// No extension, just add .json.archive
output.set_extension("json.archive");
}
output
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
#[test]
fn test_create_archive_single_file() -> Result<(), Box<dyn std::error::Error>> {
// Create input file
let mut input_file = NamedTempFile::new()?;
writeln!(input_file, r#"{{"count": 0, "name": "test"}}"#)?;
input_file.flush()?;
// Create output file
let output_file = NamedTempFile::new()?;
let diagnostics = create_archive(
&[input_file.path()],
output_file.path(),
Some("test-source".to_string()),
None,
);
assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics);
// Verify the output
let content = std::fs::read_to_string(output_file.path())?;
let header: Header = serde_json::from_str(content.lines().next().unwrap())?;
assert_eq!(header.file_type, "@peoplesgrocers/json-archive");
assert_eq!(header.version, 1);
assert_eq!(header.initial, json!({"count": 0, "name": "test"}));
Ok(())
}
#[test]
fn test_create_archive_multiple_files() -> Result<(), Box<dyn std::error::Error>> {
// Create input files
let mut file1 = NamedTempFile::new()?;
let mut file2 = NamedTempFile::new()?;
writeln!(file1, r#"{{"count": 0}}"#)?;
writeln!(file2, r#"{{"count": 1}}"#)?;
file1.flush()?;
file2.flush()?;
let output_file = NamedTempFile::new()?;
let diagnostics = create_archive(
&[file1.path(), file2.path()],
output_file.path(),
None,
None,
);
assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics);
// Verify output has header + observation events
let content = std::fs::read_to_string(output_file.path())?;
let lines: Vec<&str> = content.lines().collect();
assert!(lines.len() >= 3); // header + comment + observe + change
// First line should be header
let header: Header = serde_json::from_str(lines[0])?;
assert_eq!(header.initial, json!({"count": 0}));
// Should contain observe and change events
assert!(content.contains("observe"));
assert!(content.contains("change"));
assert!(content.contains("/count"));
Ok(())
}
#[test]
fn test_append_to_uncompressed_archive() -> Result<(), Box<dyn std::error::Error>> {
// Create initial archive
let mut archive_file = NamedTempFile::new()?;
let header = Header::new(json!({"count": 0}), None);
writeln!(archive_file, "{}", serde_json::to_string(&header)?)?;
archive_file.flush()?;
// Create file to append
let mut new_file = NamedTempFile::new()?;
writeln!(new_file, r#"{{"count": 1}}"#)?;
new_file.flush()?;
let diagnostics = append_to_archive(
archive_file.path(),
&[new_file.path()],
archive_file.path(),
None,
None,
);
assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics);
// Verify the archive was updated
let content = std::fs::read_to_string(archive_file.path())?;
assert!(content.contains("observe"));
assert!(content.contains("change"));
assert!(content.contains("/count"));
Ok(())
}
#[test]
fn test_default_output_filename() {
assert_eq!(
default_output_filename("test.json"),
PathBuf::from("test.json.archive")
);
assert_eq!(
default_output_filename("test.txt"),
PathBuf::from("test.txt.json.archive")
);
assert_eq!(
default_output_filename("test"),
PathBuf::from("test.json.archive")
);
assert_eq!(
default_output_filename("test.json.archive"),
PathBuf::from("test.json.archive")
);
}
}

View file

@ -29,6 +29,7 @@ use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, Diagno
use crate::event_deserialize::EventDeserializer;
use crate::events::{Event, Header};
use crate::pointer::JsonPointer;
use crate::detection::{CompressionFormat, detect_compression_format};
#[cfg(feature = "compression")]
use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
@ -43,16 +44,6 @@ pub enum ReadMode {
AppendSeek,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompressionFormat {
Gzip,
Deflate,
Zlib,
Brotli,
Zstd,
None,
}
pub struct ArchiveReader {
mode: ReadMode,
filename: String,
@ -99,8 +90,7 @@ impl Iterator for EventIterator {
Ok(d) => d,
Err(e) => {
self.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::InvalidEventJson,
format!("I couldn't parse this line as JSON: {}", e),
)
@ -137,8 +127,7 @@ impl Iterator for EventIterator {
}
Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
self.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::InvalidUtf8,
format!("I found invalid UTF-8 bytes at line {}.", self.line_number)
)
@ -157,40 +146,6 @@ impl Iterator for EventIterator {
}
}
fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat {
if bytes.len() < 4 {
return CompressionFormat::None;
}
// Gzip magic number: 0x1f 0x8b
if bytes[0] == 0x1f && bytes[1] == 0x8b {
return CompressionFormat::Gzip;
}
// Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda
if bytes[0] == 0x78 && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) {
return CompressionFormat::Zlib;
}
// Zstd magic number: 0x28 0xb5 0x2f 0xfd
if bytes.len() >= 4 && bytes[0] == 0x28 && bytes[1] == 0xb5 && bytes[2] == 0x2f && bytes[3] == 0xfd {
return CompressionFormat::Zstd;
}
// Check file extension for brotli (no reliable magic number) and deflate
if let Some(ext) = path.extension() {
let ext_str = ext.to_string_lossy();
if ext_str == "br" || path.to_string_lossy().contains(".br.") {
return CompressionFormat::Brotli;
}
if ext_str == "deflate" {
return CompressionFormat::Deflate;
}
}
CompressionFormat::None
}
impl ArchiveReader {
pub fn new<P: AsRef<Path>>(path: P, mode: ReadMode) -> std::io::Result<Self> {
let filename = path.as_ref().display().to_string();
@ -224,8 +179,7 @@ impl ArchiveReader {
};
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
format!("I detected a {}-compressed archive, but this build doesn't support compression.", format_name)
)
@ -271,8 +225,7 @@ impl ArchiveReader {
Ok(0) => {
// Empty file
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::EmptyFile,
"I found an empty file, but I need at least a header line.".to_string(),
)
@ -295,8 +248,7 @@ impl ArchiveReader {
Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
// UTF-8 error
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::InvalidUtf8,
"I found invalid UTF-8 bytes at line 1.".to_string()
)
@ -420,8 +372,7 @@ impl ArchiveReader {
&& !seen_observations.contains(&observation_id)
{
event_iter.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::NonExistentObservationId,
format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id)
)
@ -447,8 +398,7 @@ impl ArchiveReader {
&& !seen_observations.contains(&observation_id)
{
event_iter.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::NonExistentObservationId,
format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id)
)
@ -470,8 +420,7 @@ impl ArchiveReader {
&& !seen_observations.contains(&observation_id)
{
event_iter.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::NonExistentObservationId,
format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id)
)
@ -493,8 +442,7 @@ impl ArchiveReader {
&& !seen_observations.contains(&observation_id)
{
event_iter.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::NonExistentObservationId,
format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id)
)
@ -512,8 +460,7 @@ impl ArchiveReader {
Event::Snapshot { observation_id: _, timestamp: _, object } => {
if self.mode == ReadMode::FullValidation && state != object {
event_iter.diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::SnapshotStateMismatch,
"I found a snapshot whose state doesn't match the replayed state up to this point.".to_string()
)
@ -566,8 +513,7 @@ impl ArchiveReader {
Ok(v) => v,
Err(e) => {
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::MissingHeader,
format!("I couldn't parse the header as JSON: {}", e),
)
@ -587,8 +533,7 @@ impl ArchiveReader {
Ok(header) => {
if header.version != 1 {
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::UnsupportedVersion,
format!("I found version {}, but I only support version 1.", header.version)
)
@ -606,8 +551,7 @@ impl ArchiveReader {
}
Err(e) => {
diagnostics.add(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::MissingHeaderField,
format!("I couldn't parse the header: {}", e),
)
@ -666,12 +610,11 @@ pub fn apply_move(
) -> Result<(), Diagnostic> {
let pointer = JsonPointer::new(path)?;
let array = pointer.get(state)?;
let array = pointer.get_mut(state)?;
if !array.is_array() {
return Err(
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::MoveOnNonArray,
format!(
"I can't apply move operations to '{}' because it's not an array.",
@ -686,48 +629,41 @@ pub fn apply_move(
);
}
let mut arr = array.as_array().unwrap().clone();
let arr = array.as_array_mut().unwrap();
for (from_idx, to_idx) in moves {
if from_idx >= arr.len() {
return Err(
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MoveIndexOutOfBounds,
format!(
"The 'from' index {} is out of bounds (array length is {}).",
from_idx,
arr.len()
),
)
);
// Validate all moves upfront before mutating
for (from_idx, to_idx) in &moves {
if *from_idx >= arr.len() {
return Err(Diagnostic::fatal(
DiagnosticCode::MoveIndexOutOfBounds,
format!(
"The 'from' index {} is out of bounds (array length is {}).",
from_idx,
arr.len()
),
));
}
if to_idx > arr.len() {
return Err(
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MoveIndexOutOfBounds,
format!(
"The 'to' index {} is out of bounds (array length is {}).",
to_idx,
arr.len()
),
)
);
if *to_idx > arr.len() {
return Err(Diagnostic::fatal(
DiagnosticCode::MoveIndexOutOfBounds,
format!(
"The 'to' index {} is out of bounds (array length is {}).",
to_idx,
arr.len()
),
));
}
let element = arr[from_idx].clone();
arr.insert(to_idx, element);
let remove_idx = if from_idx > to_idx {
from_idx + 1
} else {
from_idx
};
arr.remove(remove_idx);
}
pointer.set(state, Value::Array(arr))
// Apply moves now that we know they're all valid
for (from_idx, to_idx) in moves {
let element = arr.remove(from_idx);
let insert_idx = if to_idx > from_idx { to_idx - 1 } else { to_idx };
arr.insert(insert_idx, element);
}
Ok(())
}
#[cfg(test)]

1040
src/archive_writer.rs Normal file

File diff suppressed because it is too large

253
src/atomic_file.rs Normal file
View file

@ -0,0 +1,253 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
//! Problem: how do you append data to a compressed archive without losing data?
//!
//! Gzip and similar formats don't support in-place append. To add one record
//! to a 20GB archive, you decompress it all, add the record, and recompress.
//!
//! You have two options:
//!
//! Option A: Overwrite in place. Seek to byte 0 of the existing file and start
//! writing the new compressed stream. No extra disk space needed. But if you
//! fail mid-write (out of space, crash, power loss), you've corrupted the
//! original and lost everything. With a 20GB file, that's a lot of time spent
//! in the danger zone.
//!
//! Option B: Write to a new file, then swap. Requires 2x disk space temporarily,
//! but the original stays intact until the new file is complete. If writing
//! fails, you just delete the partial temp file.
//!
//! This module implements option B. I'm not comfortable with option A.
//!
//! The swap sequence:
//! 1. Write new archive to `.archive.json.gz.a7bX2q`
//! 2. Rename original to `.archive.json.gz.a7bX2q.old` (backup)
//! 3. Rename temp to `archive.json.gz` (atomic on same filesystem)
//! 4. Delete backup
//!
//! If writing fails, original is untouched. If the swap fails, we restore
//! from backup. Data loss requires a kernel crash between steps 2 and 3.
//!
//! Assumes everything is on one filesystem. Cross-filesystem renames aren't
//! atomic and we don't handle them.
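//!
//! A sketch of the intended call pattern (illustration only):
//!
//! ```ignore
//! let original = std::path::PathBuf::from("archive.json.gz");
//! let temp = generate_temp_filename(&original);
//! // ... write the complete new compressed archive to `temp` ...
//! atomic_replace_file(&original, &temp)?;
//! ```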
use std::path::{Path, PathBuf};
use uuid::Uuid;
use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticLevel};
/// Generate a rsync-style temporary filename with dot prefix and random suffix
///
/// For example: "archive.json.gz" -> ".archive.json.gz.a7bX2q"
///
/// The naming convention follows rsync's pattern:
/// - Prefix with `.` to hide the file on Unix systems
/// - Append a 6-character random suffix for uniqueness
pub fn generate_temp_filename<P: AsRef<Path>>(path: P) -> PathBuf {
let path = path.as_ref();
// Generate 6-character random suffix using first 6 hex chars of a uuid
let uuid = Uuid::new_v4();
let hex = format!("{:x}", uuid.as_u128());
let random_suffix = &hex[..6];
// Get the filename
if let Some(filename) = path.file_name() {
if let Some(filename_str) = filename.to_str() {
// Create new filename: .{original}.{random}
let temp_filename = format!(".{}.{}", filename_str, random_suffix);
// Return path with new filename
if let Some(parent) = path.parent() {
return parent.join(temp_filename);
} else {
return PathBuf::from(temp_filename);
}
}
}
// Fallback: just add prefix and suffix to entire path
let mut temp_path = path.to_path_buf();
temp_path.set_file_name(format!(".{}.{}", path.display(), random_suffix));
temp_path
}
/// Atomically replace a file using rsync-style temp files
///
/// This performs the following sequence:
/// 1. Write new content to temp_path (caller's responsibility - already done)
/// 2. Move original_path -> .original_path.{random}.old (backup)
/// 3. Move temp_path -> original_path (replace)
/// 4. Delete .original_path.{random}.old (cleanup)
///
/// If any step fails, attempts to recover by restoring the backup.
///
/// # Arguments
///
/// * `original_path` - The file to be replaced
/// * `temp_path` - The temporary file containing the new content
///
/// # Errors
///
/// Returns diagnostics if any step of the operation fails. The function
/// attempts automatic recovery by restoring the backup if the replacement fails.
pub fn atomic_replace_file<P: AsRef<Path>>(original_path: P, temp_path: P) -> Result<(), Vec<Diagnostic>> {
let original = original_path.as_ref();
let temp = temp_path.as_ref();
// Generate backup filename with same random suffix as temp file
let backup_path = if let Some(filename) = original.file_name() {
if let Some(filename_str) = filename.to_str() {
// Extract random suffix from temp filename if it follows our pattern
let temp_filename = temp.file_name().and_then(|f| f.to_str()).unwrap_or("");
let random_suffix = if temp_filename.starts_with('.') && temp_filename.contains(filename_str) {
// Extract suffix after the original filename
temp_filename.rsplit('.').next().unwrap_or("backup")
} else {
"backup"
};
let backup_filename = format!(".{}.{}.old", filename_str, random_suffix);
if let Some(parent) = original.parent() {
parent.join(backup_filename)
} else {
PathBuf::from(backup_filename)
}
} else {
original.with_extension("old")
}
} else {
original.with_extension("old")
};
// Step 1: Move original to backup
if let Err(e) = std::fs::rename(original, &backup_path) {
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't create backup of the original archive: {}", e),
)
.with_advice(
"Make sure you have write permission in this directory and sufficient disk space."
.to_string()
)]);
}
// Step 2: Move temp to original
if let Err(e) = std::fs::rename(temp, original) {
// Recovery: Try to restore backup
let recovery_error = if std::fs::rename(&backup_path, original).is_ok() {
format!(
"I couldn't move the new archive into place: {}\nI've restored the original archive from backup.",
e
)
} else {
format!(
"I couldn't move the new archive into place: {}\nWARNING: I also failed to restore the backup. Your original is at: {}",
e,
backup_path.display()
)
};
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
recovery_error,
)
.with_advice(
"Check filesystem permissions and disk space. If the backup exists, you can manually restore it."
.to_string()
)]);
}
// Step 3: Delete backup
// This is non-critical - if it fails, we just leave the backup around
let _ = std::fs::remove_file(&backup_path);
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_generate_temp_filename() {
let temp = generate_temp_filename("archive.json.gz");
let filename = temp.file_name().unwrap().to_str().unwrap();
// Should start with dot
assert!(filename.starts_with('.'));
// Should contain original filename
assert!(filename.contains("archive.json.gz"));
// Should have a random suffix (dot followed by 6 chars)
assert!(filename.matches('.').count() >= 3); // .archive.json.gz has 2, plus 1 before random
}
#[test]
fn test_atomic_replace_file() -> Result<(), Box<dyn std::error::Error>> {
// Create original file
let mut original = NamedTempFile::new()?;
writeln!(original, "original content")?;
original.flush()?;
let original_path = original.path().to_path_buf();
// Create temp file with new content
let temp_path = generate_temp_filename(&original_path);
{
let mut temp_file = File::create(&temp_path)?;
writeln!(temp_file, "new content")?;
}
// Perform atomic replace
atomic_replace_file(&original_path, &temp_path)
.map_err(|e| format!("Failed to replace file: {:?}", e))?;
// Verify new content
let content = std::fs::read_to_string(&original_path)?;
assert_eq!(content.trim(), "new content");
// Verify temp file is gone
assert!(!temp_path.exists());
// Verify backup is cleaned up
let backup_pattern = format!(".{}.", original_path.file_name().unwrap().to_str().unwrap());
let parent = original_path.parent().unwrap();
let backups: Vec<_> = std::fs::read_dir(parent)?
.filter_map(|e| e.ok())
.filter(|e| {
e.file_name()
.to_str()
.map(|s| s.contains(&backup_pattern) && s.ends_with(".old"))
.unwrap_or(false)
})
.collect();
assert_eq!(backups.len(), 0, "Backup file should be cleaned up");
Ok(())
}
}

View file

@ -0,0 +1,135 @@
// Generates documentation for JSON pointer diagnostics.
//
// Run with: cargo run --bin pointer_errors_demo > docs/diagnostics/json-pointer.md
use json_archive::JsonPointer;
use serde_json::json;
fn print_example(pointer_str: &str, value: &mut serde_json::Value) {
println!("```");
let pointer = JsonPointer::new(pointer_str).unwrap();
if let Err(diag) = pointer.get_mut(value) {
print!("{}", diag);
}
println!("```");
}
fn main() {
print!(r#"<!-- Generated by: cargo run --bin pointer_errors_demo > docs/diagnostics/json-pointer.md -->
# JSON Pointer Diagnostics
These are the error messages you'll see when a [JSON Pointer (RFC 6901)](https://datatracker.ietf.org/doc/html/rfc6901)
operation fails.
## Why These Errors Are Limited
The JSON object that failed to index probably doesn't exist anywhere as a file. It's
built by replaying delta events from the archive. The filename and line numbers in
these errors point to the source of the JSON pointer paths (the add/change/remove
events in the archive), not to the object itself.
A proper solution would dump the reconstructed JSON object to a file so you could
inspect it with `jq` or a text editor. That engineering work didn't happen.
Instead, you get:
- The pointer path that failed, with the failing segment underlined
- The actual value at the parent path (truncated)
- Some strings you can grep for in the archive
This is better than nothing, but it's still awkward. You can see *what* failed but
not easily inspect the full object we tried to index into. If you're lucky, the
truncated value shown is enough. If you're developing on this project, at least
you know what the errors look like.
## Contributing
If an error message is confusing or unhelpful for your case, please open an issue
or submit a pull request.
## Key Not Found
Key doesn't exist in the object. Shows available keys and suggests typos.
"#);
print_example(
"/user/emial",
&mut json!({
"user": {
"name": "Alice",
"email": "alice@example.com",
"age": 30
}
}),
);
print!(r#"
## Type Mismatch
Tried to index into a value that doesn't support it (e.g., `/domain` on a string,
`/0` on a number). Shows the actual type.
"#);
print_example(
"/users/0/email/domain",
&mut json!({
"users": [
{"email": "alice@example.com"}
]
}),
);
print!(r#"
## Array Index Out of Bounds
Index past the end of the array. Shows the array length.
"#);
print_example(
"/items/5",
&mut json!({
"items": ["apple", "banana", "cherry"]
}),
);
print!(r#"
## Array Index
If you think you have an object but you're actually indexing into an array, you'll see this error.
"#);
print_example(
"/items/foo",
&mut json!({
"items": ["apple", "banana", "cherry"]
}),
);
print!(r#"
## Deep Path Failures
For long paths, the underline shows which segment failed. The full path remains
visible so you can see what you were trying to reach.
"#);
print_example(
"/data/users/0/profile/settings/theme",
&mut json!({
"data": {
"users": [
{
"profile": {
"name": "Alice"
}
}
]
}
}),
);
}

View file

@ -31,9 +31,16 @@
//! Design choice by @nobody. No user requests for this, just seemed nice.
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
#[cfg(feature = "compression")]
use brotli::Decompressor;
#[cfg(feature = "compression")]
use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
#[cfg(feature = "compression")]
use zstd::stream::read::Decoder as ZstdDecoder;
/// Detects if a file is a JSON archive by checking file extension or inspecting the header.
///
/// Detection strategy:
@ -52,20 +59,65 @@ use std::path::Path;
pub fn is_json_archive<P: AsRef<Path>>(path: P) -> Result<bool, std::io::Error> {
let path = path.as_ref();
// Check file extension first (fast path)
if let Some(filename) = path.file_name() {
if let Some(filename_str) = filename.to_str() {
if filename_str.ends_with(".json.archive") {
// Match .json.archive with any compression suffix
if filename_str.ends_with(".json.archive")
|| filename_str.ends_with(".json.archive.gz")
|| filename_str.ends_with(".json.archive.br")
|| filename_str.ends_with(".json.archive.zst")
|| filename_str.ends_with(".json.archive.zlib")
{
return Ok(true);
}
}
}
let file = File::open(path)?;
let mut reader = BufReader::new(file);
// Open file and detect compression
let mut file = File::open(path)?;
let mut magic_bytes = [0u8; 4];
let bytes_read = file.read(&mut magic_bytes)?;
let compression = detect_compression_format(path, &magic_bytes[..bytes_read]);
// Reopen file to reset position
file = File::open(path)?;
// Create appropriate reader based on compression format
let reader: Box<dyn BufRead> = create_reader(file, compression)?;
check_header_line(reader)
}
/// Create a buffered reader that handles decompression if needed.
#[cfg(feature = "compression")]
fn create_reader(file: File, compression: CompressionFormat) -> Result<Box<dyn BufRead>, std::io::Error> {
Ok(match compression {
CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))),
CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))),
CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))),
CompressionFormat::Brotli => Box::new(BufReader::new(Decompressor::new(file, 4096))),
CompressionFormat::Zstd => Box::new(BufReader::new(ZstdDecoder::new(file)?)),
CompressionFormat::None => Box::new(BufReader::new(file)),
})
}
#[cfg(not(feature = "compression"))]
fn create_reader(file: File, compression: CompressionFormat) -> Result<Box<dyn BufRead>, std::io::Error> {
if compression != CompressionFormat::None {
// Without compression support, we can't decompress to check the header.
// Return false by returning an empty reader that will fail header check.
return Ok(Box::new(BufReader::new(std::io::empty())));
}
Ok(Box::new(BufReader::new(file)))
}
/// Check if the first line of the reader contains a valid archive header.
fn check_header_line(mut reader: Box<dyn BufRead>) -> Result<bool, std::io::Error> {
let mut first_line = String::new();
match reader.read_line(&mut first_line) {
Ok(0) => return Ok(false), // Empty file
Ok(0) => Ok(false), // Empty file
Ok(_) => {
// Try to parse as JSON and check if it has our type field as the first key
if let Ok(value) = serde_json::from_str::<serde_json::Value>(&first_line) {
@@ -81,11 +133,54 @@ pub fn is_json_archive<P: AsRef<Path>>(path: P) -> Result<bool, std::io::Error>
}
}
}
Ok(false)
}
Err(e) => return Err(e),
Err(e) => Err(e),
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionFormat {
Gzip,
Deflate,
Zlib,
Brotli,
Zstd,
None,
}
pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat {
if bytes.len() < 4 {
return CompressionFormat::None;
}
// Gzip magic number: 0x1f 0x8b
if bytes[0] == 0x1f && bytes[1] == 0x8b {
return CompressionFormat::Gzip;
}
// Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda
if bytes[0] == 0x78 && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) {
return CompressionFormat::Zlib;
}
// Zstd magic number: 0x28 0xb5 0x2f 0xfd
if bytes.len() >= 4 && bytes[0] == 0x28 && bytes[1] == 0xb5 && bytes[2] == 0x2f && bytes[3] == 0xfd {
return CompressionFormat::Zstd;
}
// Check file extension for brotli (no reliable magic number) and deflate
if let Some(ext) = path.extension() {
let ext_str = ext.to_string_lossy();
if ext_str == "br" || path.to_string_lossy().contains(".br.") {
return CompressionFormat::Brotli;
}
if ext_str == "deflate" {
return CompressionFormat::Deflate;
}
}
CompressionFormat::None
}
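// Sketch of how content-based detection behaves for a gzip stream (the path and
// byte buffer here are hypothetical): the leading bytes 0x1f 0x8b identify gzip
// regardless of the file name.
//
//     let fmt = detect_compression_format(Path::new("data.bin"), &[0x1f, 0x8b, 0x08, 0x00]);
//     assert_eq!(fmt, CompressionFormat::Gzip);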
#[cfg(test)]

View file

@@ -188,6 +188,11 @@ impl Diagnostic {
}
}
#[inline]
pub fn fatal(code: DiagnosticCode, description: String) -> Self {
Self::new(DiagnosticLevel::Fatal, code, description)
}
pub fn with_location(mut self, filename: String, line_number: usize) -> Self {
self.filename = Some(filename);
self.line_number = Some(line_number);

View file

@@ -24,10 +24,13 @@ use std::path::PathBuf;
xflags::xflags! {
cmd json-archive {
default cmd create {
/// Input JSON files in chronological order (first file determines default output name)
/// Input JSON files in chronological order. If the first file is a .json.archive file,
/// the remaining files are appended to it. Otherwise a new archive is created from all files.
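/// Examples (illustrative invocations, file names hypothetical):
///   json-archive state_1.json state_2.json -o history.json.archive
///   json-archive history.json.archive state_3.json   (appends in place)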
repeated inputs: PathBuf
/// Output archive file path (defaults to first input + .json.archive)
/// Output archive file path. Defaults to <first-input>.json.archive for new archives,
/// or to the archive path itself when appending (in-place update). Use -o to write
/// the result to a different location.
optional -o, --output output: PathBuf
/// Insert snapshot every N observations (optional)

View file

@@ -19,7 +19,12 @@
// marxism@peoplesgrocers.com
//
pub mod archive;
pub mod archive_context;
pub mod archive_open;
pub mod archive_ops;
pub mod archive_reader;
pub mod archive_writer;
pub mod atomic_file;
pub mod detection;
pub mod diagnostics;
pub mod diff;
@@ -27,13 +32,13 @@ pub mod event_deserialize;
pub mod events;
pub mod flags;
pub mod pointer;
pub mod reader;
mod pointer_errors;
pub use archive::{
pub use archive_writer::{
append_to_archive, create_archive_from_files, default_output_filename, ArchiveBuilder, ArchiveWriter,
};
pub use detection::is_json_archive;
pub use diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel};
pub use events::{Event, Header, Observation};
pub use pointer::JsonPointer;
pub use reader::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, ReadMode, ReadResult};
pub use archive_reader::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, ReadMode, ReadResult};

View file

@@ -19,10 +19,8 @@
// marxism@peoplesgrocers.com
//
use json_archive::{
append_to_archive, create_archive_from_files, default_output_filename, is_json_archive, Diagnostic,
DiagnosticCode, DiagnosticLevel,
};
use json_archive::archive_ops::{append_to_archive, create_archive, default_output_filename};
use json_archive::{is_json_archive, Diagnostic, DiagnosticCode, DiagnosticLevel};
use std::path::Path;
use std::process;
@@ -46,15 +44,22 @@ fn main() {
fn run(flags: flags::JsonArchive) -> Vec<Diagnostic> {
match flags.subcommand {
flags::JsonArchiveCmd::Create(create_flags) => create_archive(&create_flags),
flags::JsonArchiveCmd::Create(create_flags) => run_create(&create_flags),
flags::JsonArchiveCmd::Info(info_flags) => cmd::info::run(&info_flags),
flags::JsonArchiveCmd::State(state_flags) => cmd::state::run(&state_flags),
}
}
fn create_archive(flags: &flags::Create) -> Vec<Diagnostic> {
struct ParsedCreateArgs {
destination: std::path::PathBuf,
input_files: Vec<std::path::PathBuf>,
}
/// Parse the create command arguments to determine the destination archive and input files.
/// This consolidates all the inferring behavior in one place.
fn parse_create_args(flags: &flags::Create) -> Result<ParsedCreateArgs, Vec<Diagnostic>> {
if flags.inputs.is_empty() {
return vec![Diagnostic::new(
return Err(vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MissingHeaderField,
"I need at least one JSON file to create an archive, but you didn't provide any."
@@ -65,16 +70,53 @@ fn create_archive(flags: &flags::Create) -> Vec<Diagnostic> {
The first file will be used as the initial state, and subsequent files \
will be compared to generate change events."
.to_string(),
)];
)]);
}
let output_path = match &flags.output {
Some(path) => path.clone(),
None => default_output_filename(&flags.inputs[0]),
// Determine the destination archive path
let destination = if let Some(output) = &flags.output {
// Explicitly specified output path
output.clone()
} else if Path::new(&flags.inputs[0]).exists()
&& is_json_archive(&flags.inputs[0]).unwrap_or(false)
{
// First input is an existing archive - use it as destination
flags.inputs[0].clone()
} else {
// Infer from first input
default_output_filename(&flags.inputs[0])
};
// Filter out the destination from input files to avoid read-write conflicts
let input_files: Vec<_> = flags.inputs
.iter()
.filter(|path| {
match (std::fs::canonicalize(path).ok(), std::fs::canonicalize(&destination).ok()) {
(Some(p), Some(d)) => p != d,
_ => true, // Include if canonicalization fails (file doesn't exist yet)
}
})
.cloned()
.collect();
if input_files.is_empty() {
return Err(vec![
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MissingHeaderField,
"No input files remain after filtering out the destination archive.".to_string()
)
.with_advice(
"You specified the output path in the list of input files. This would cause a read-write conflict.\n\
Either remove the output path from inputs, or use a different output path with -o."
.to_string()
)
]);
}
// Validate all input files exist
let mut diagnostics = Vec::new();
for input_path in &flags.inputs {
for input_path in &input_files {
if !Path::new(input_path).exists() {
diagnostics.push(
Diagnostic::new(
@@ -92,43 +134,21 @@ fn create_archive(flags: &flags::Create) -> Vec<Diagnostic> {
}
if !diagnostics.is_empty() {
return diagnostics;
return Err(diagnostics);
}
let first_is_archive = match is_json_archive(&flags.inputs[0]) {
Ok(is_archive) => is_archive,
Err(e) => {
return vec![Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't check if the first file is an archive: {}", e),
)];
}
Ok(ParsedCreateArgs {
destination,
input_files,
})
}
fn run_create(flags: &flags::Create) -> Vec<Diagnostic> {
let parsed = match parse_create_args(flags) {
Ok(parsed) => parsed,
Err(diagnostics) => return diagnostics,
};
if first_is_archive {
println!("First input appears to be a JSON archive file");
if flags.inputs.len() == 1 {
return vec![
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::MissingHeaderField,
"I found that the first input is already an archive file, but you didn't provide any additional JSON files to append.".to_string()
)
.with_advice(
"If you want to append to an archive, provide additional JSON files:\n\
json-archive existing.json.archive new1.json new2.json"
.to_string()
)
];
}
return append_to_archive(&flags.inputs[0], &flags.inputs[1..], &output_path, flags.source.clone(), flags.snapshot_interval);
}
println!("Creating archive: {}", output_path.display());
println!("Input files: {:?}", flags.inputs);
if let Some(interval) = flags.snapshot_interval {
println!("Snapshot interval: every {} observations", interval);
}
@@ -137,16 +157,42 @@ fn create_archive(flags: &flags::Create) -> Vec<Diagnostic> {
println!("Source: {}", source);
}
match create_archive_from_files(
&flags.inputs,
output_path.clone(),
// If destination exists and is an archive, append to it
if Path::new(&parsed.destination).exists() {
if let Ok(true) = is_json_archive(&parsed.destination) {
println!("Appending to existing archive: {}", parsed.destination.display());
println!("Input files: {:?}", parsed.input_files);
let diagnostics = append_to_archive(
&parsed.destination,
&parsed.input_files,
&parsed.destination,
flags.source.clone(),
flags.snapshot_interval,
);
if diagnostics.is_empty() {
println!("Archive updated successfully: {}", parsed.destination.display());
}
return diagnostics;
}
}
// Otherwise create a new archive from the input files
println!("Creating new archive: {}", parsed.destination.display());
println!("Input files: {:?}", parsed.input_files);
let diagnostics = create_archive(
&parsed.input_files,
parsed.destination.clone(),
flags.source.clone(),
flags.snapshot_interval,
) {
Ok(()) => {
println!("Archive created successfully: {}", output_path.display());
Vec::new()
}
Err(diagnostics) => diagnostics,
);
if diagnostics.is_empty() {
println!("Archive created successfully: {}", parsed.destination.display());
}
diagnostics
}

View file

@@ -19,7 +19,11 @@
// marxism@peoplesgrocers.com
//
use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticLevel};
use crate::diagnostics::{Diagnostic, DiagnosticCode};
use crate::pointer_errors::{
build_array_index_out_of_bounds_error, build_invalid_array_index_error,
build_key_not_found_error, build_type_mismatch_error,
};
use serde_json::Value;
#[derive(Debug, Clone, PartialEq)]
@@ -34,8 +38,7 @@ impl JsonPointer {
}
if !path.starts_with('/') {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
return Err(Diagnostic::fatal(
DiagnosticCode::InvalidPointerSyntax,
format!(
"I couldn't parse the path '{}': Path must start with '/'",
@@ -52,49 +55,52 @@ impl JsonPointer {
Ok(JsonPointer { tokens })
}
pub fn get<'a>(&self, value: &'a Value) -> Result<&'a Value, Diagnostic> {
/// Traverse the JSON value following this pointer, returning a mutable reference.
///
/// Errors include rich context: the full path, which segment failed, the value
/// at that point, and suggestions for typos. See `pointer_errors` module for details.
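    /// e.g. a failing lookup of "/user/emial" lists the keys available at "/user" and,
    /// when a close match exists, suggests "email".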
pub fn get_mut<'a>(&self, value: &'a mut Value) -> Result<&'a mut Value, Diagnostic> {
let mut current = value;
for token in &self.tokens {
for (token_index, token) in self.tokens.iter().enumerate() {
match current {
Value::Object(obj) => {
current = obj.get(token).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't find the key '{}'", token),
)
})?;
if obj.contains_key(token) {
current = obj.get_mut(token).unwrap();
} else {
let keys: Vec<String> = obj.keys().cloned().collect();
let key_refs: Vec<&str> = keys.iter().map(|s| s.as_str()).collect();
return Err(build_key_not_found_error(
&self.tokens,
token_index,
token,
&key_refs,
));
}
}
Value::Array(arr) => {
let arr_len = arr.len();
let index = token.parse::<usize>().map_err(|_| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidArrayIndex,
format!("I couldn't parse '{}' as an array index", token),
)
})?;
current = arr.get(index).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!(
"I couldn't find index {} (array length is {})",
index,
arr.len()
),
)
build_invalid_array_index_error(&self.tokens, token_index, token, arr)
})?;
if index < arr_len {
current = &mut arr[index];
} else {
return Err(build_array_index_out_of_bounds_error(
&self.tokens,
token_index,
index,
arr_len,
arr,
));
}
}
_ => {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::TypeMismatch,
format!(
"I can't index into {} with '{}'",
current.type_name(),
token
),
return Err(build_type_mismatch_error(
&self.tokens,
token_index,
token,
current,
));
}
}
@@ -103,68 +109,32 @@ impl JsonPointer {
Ok(current)
}
/// Returns the parent pointer (all tokens except the last).
///
/// Used by `set` and `remove`: to modify a value, we need a mutable reference
/// to its parent container (object or array), then operate on the final key/index.
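    /// e.g. for "/items/2" the parent pointer is "/items"; `set`/`remove` then act on index 2.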
fn parent(&self) -> JsonPointer {
JsonPointer {
tokens: self.tokens[..self.tokens.len() - 1].to_vec(),
}
}
pub fn set(&self, value: &mut Value, new_value: Value) -> Result<(), Diagnostic> {
if self.tokens.is_empty() {
*value = new_value;
return Ok(());
}
let mut current = value;
let last_token = &self.tokens[self.tokens.len() - 1];
let parent = self.parent().get_mut(value)?;
for token in &self.tokens[..self.tokens.len() - 1] {
match current {
Value::Object(obj) => {
current = obj.get_mut(token).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't find the key '{}'", token),
)
})?;
}
Value::Array(arr) => {
let index = token.parse::<usize>().map_err(|_| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidArrayIndex,
format!("I couldn't parse '{}' as an array index", token),
)
})?;
let array_len = arr.len();
current = arr.get_mut(index).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!(
"I couldn't find index {} (array length is {})",
index, array_len
),
)
})?;
}
_ => {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::TypeMismatch,
format!(
"I can't index into {} with '{}'",
current.type_name(),
token
),
));
}
}
}
match current {
match parent {
Value::Object(obj) => {
obj.insert(last_token.clone(), new_value);
}
Value::Array(arr) => {
let index = last_token.parse::<usize>().map_err(|_| {
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::InvalidArrayIndex,
format!("I couldn't parse '{}' as an array index", last_token),
)
@@ -175,8 +145,7 @@ impl JsonPointer {
} else if index < arr.len() {
arr[index] = new_value;
} else {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
return Err(Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!(
"I couldn't set index {} (array length is {})",
@@ -187,13 +156,12 @@ impl JsonPointer {
}
}
_ => {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
return Err(Diagnostic::fatal(
DiagnosticCode::TypeMismatch,
format!(
"I can't set property '{}' on {}",
last_token,
current.type_name()
parent.type_name()
),
));
}
@@ -204,73 +172,25 @@ impl JsonPointer {
pub fn remove(&self, value: &mut Value) -> Result<Value, Diagnostic> {
if self.tokens.is_empty() {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
return Err(Diagnostic::fatal(
DiagnosticCode::InvalidPointerSyntax,
"I can't remove the root value".to_string(),
));
}
let mut current = value;
let last_token = &self.tokens[self.tokens.len() - 1];
let parent = self.parent().get_mut(value)?;
for token in &self.tokens[..self.tokens.len() - 1] {
match current {
Value::Object(obj) => {
current = obj.get_mut(token).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!("I couldn't find the key '{}'", token),
)
})?;
}
Value::Array(arr) => {
let index = token.parse::<usize>().map_err(|_| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::InvalidArrayIndex,
format!("I couldn't parse '{}' as an array index", token),
)
})?;
let array_len = arr.len();
current = arr.get_mut(index).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::PathNotFound,
format!(
"I couldn't find index {} (array length is {})",
index, array_len
),
)
})?;
}
_ => {
return Err(Diagnostic::new(
DiagnosticLevel::Fatal,
DiagnosticCode::TypeMismatch,
format!(
"I can't index into {} with '{}'",
current.type_name(),
token
),
));
}
}
}
match current {
match parent {
Value::Object(obj) => obj.remove(last_token).ok_or_else(|| {
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!("I couldn't find the key '{}' to remove", last_token),
)
}),
Value::Array(arr) => {
let index = last_token.parse::<usize>().map_err(|_| {
Diagnostic::new(
DiagnosticLevel::Fatal,
Diagnostic::fatal(
DiagnosticCode::InvalidArrayIndex,
format!("I couldn't parse '{}' as an array index", last_token),
)
@@ -279,8 +199,7 @@ impl JsonPointer {
if index < arr.len() {
Ok(arr.remove(index))
} else {
Err(Diagnostic::new(
DiagnosticLevel::Fatal,
Err(Diagnostic::fatal(
DiagnosticCode::PathNotFound,
format!(
"I couldn't remove index {} (array length is {})",
@@ -290,13 +209,12 @@ impl JsonPointer {
))
}
}
_ => Err(Diagnostic::new(
DiagnosticLevel::Fatal,
_ => Err(Diagnostic::fatal(
DiagnosticCode::TypeMismatch,
format!(
"I can't remove property '{}' from {}",
last_token,
current.type_name()
parent.type_name()
),
)),
}
@@ -342,40 +260,40 @@ mod tests {
#[test]
fn test_empty_pointer() {
let pointer = JsonPointer::new("").unwrap();
let value = json!({"foo": "bar"});
assert_eq!(pointer.get(&value).unwrap(), &value);
let mut value = json!({"foo": "bar"});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!({"foo": "bar"}));
}
#[test]
fn test_simple_object_access() {
let pointer = JsonPointer::new("/foo").unwrap();
let value = json!({"foo": "bar"});
assert_eq!(pointer.get(&value).unwrap(), &json!("bar"));
let mut value = json!({"foo": "bar"});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!("bar"));
}
#[test]
fn test_nested_object_access() {
let pointer = JsonPointer::new("/foo/bar").unwrap();
let value = json!({"foo": {"bar": "baz"}});
assert_eq!(pointer.get(&value).unwrap(), &json!("baz"));
let mut value = json!({"foo": {"bar": "baz"}});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!("baz"));
}
#[test]
fn test_array_access() {
let pointer = JsonPointer::new("/items/0").unwrap();
let value = json!({"items": ["first", "second"]});
assert_eq!(pointer.get(&value).unwrap(), &json!("first"));
let mut value = json!({"items": ["first", "second"]});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!("first"));
}
#[test]
fn test_escape_sequences() {
let pointer = JsonPointer::new("/foo~1bar").unwrap();
let value = json!({"foo/bar": "baz"});
assert_eq!(pointer.get(&value).unwrap(), &json!("baz"));
let mut value = json!({"foo/bar": "baz"});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!("baz"));
let pointer = JsonPointer::new("/foo~0bar").unwrap();
let value = json!({"foo~bar": "baz"});
assert_eq!(pointer.get(&value).unwrap(), &json!("baz"));
let mut value = json!({"foo~bar": "baz"});
assert_eq!(pointer.get_mut(&mut value).unwrap(), &json!("baz"));
}
#[test]

414
src/pointer_errors.rs Normal file
View file

@@ -0,0 +1,414 @@
// json-archive is a tool for tracking JSON file changes over time
// Copyright (C) 2025 Peoples Grocers LLC
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// To purchase a license under different terms contact admin@peoplesgrocers.com
// To request changes, report bugs, or give user feedback contact
// marxism@peoplesgrocers.com
//
//! Rich error reporting for JSON Pointer operations.
//!
//! # Why this module exists
//!
//! When a JSON Pointer operation fails (e.g., key not found, type mismatch),
//! the user needs enough context to understand what went wrong. A message like
//! "key 'emial' not found" isn't helpful without knowing:
//!
//! - What was the full path being traversed?
//! - Which segment of the path failed?
//! - What does the value at that point actually look like?
//! - Did they maybe have a typo?
//!
//! This module builds diagnostic messages that answer these questions.
//!
//! # Why errors don't include filename/line number
//!
//! JsonPointer operates on JSON values extracted from a larger JSON Lines archive
//! file. The pointer doesn't know which line of the archive the value came from.
//! The caller (typically `reader.rs`) attaches location info via `.with_location()`:
//!
//! ```ignore
//! if let Err(diag) = pointer.set(&mut state, value) {
//! collector.add(diag.with_location(filename.clone(), line_number));
//! }
//! ```
//!
//! # Why set/remove navigate to parent first
//!
//! For `get_mut`, we traverse the entire path and return the value at the end.
//!
//! For `set` and `remove`, we need to modify a container (object or array), not
//! the value itself. To insert a key into an object or remove an element from an
//! array, we need a mutable reference to the parent container. So we:
//!
//! 1. Navigate to the parent of the target path (all tokens except the last)
//! 2. Then operate on the last token against that parent
//!
//! This means errors can occur in two places:
//! - During parent traversal (handled by `get_mut`'s error reporting)
//! - When operating on the final token (e.g., index out of bounds on the array)
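//!
//! A minimal sketch of that two-step flow (names are from this crate; the `state`
//! value is hypothetical):
//!
//! ```ignore
//! // For "/users/0/email", set() first walks to the parent "/users/0" via
//! // get_mut(), then inserts or overwrites the "email" key on that object.
//! let ptr = JsonPointer::new("/users/0/email")?;
//! ptr.set(&mut state, serde_json::json!("alice@example.com"))?;
//! ```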
use crate::diagnostics::{Diagnostic, DiagnosticCode};
use serde_json::Value;
use std::cmp::min;
const MAX_STRING_DISPLAY_LEN: usize = 50;
/// Format a JSON value compactly for error display.
/// - Strings: truncated to MAX_STRING_DISPLAY_LEN chars
/// - Objects: show keys with formatted values, nested objects as {...}
/// - Arrays: show indices with formatted values, nested arrays as [...]
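/// e.g. (sketch): `{"a": 1, "b": [2, 3]}` formats as `["\"a\": 1", "\"b\": [...]"]`.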
pub fn format_value_compact(value: &Value) -> Vec<String> {
match value {
Value::Null => vec!["null".to_string()],
Value::Bool(b) => vec![b.to_string()],
Value::Number(n) => vec![n.to_string()],
Value::String(s) => {
            if s.chars().count() > MAX_STRING_DISPLAY_LEN {
                // Truncate on a char boundary so multi-byte strings can't panic the slice.
                let truncated: String = s.chars().take(MAX_STRING_DISPLAY_LEN).collect();
                vec![format!("\"{}...\"", truncated)]
} else {
vec![format!("\"{}\"", s)]
}
}
Value::Array(arr) => arr
.iter()
.enumerate()
.map(|(i, v)| format!("{}: {}", i, format_value_inline(v)))
.collect(),
Value::Object(obj) => obj
.iter()
.map(|(k, v)| format!("\"{}\": {}", k, format_value_inline(v)))
.collect(),
}
}
/// Format a value for inline display (single token).
/// Nested structures become {...} or [...].
fn format_value_inline(value: &Value) -> String {
match value {
Value::Null => "null".to_string(),
Value::Bool(b) => b.to_string(),
Value::Number(n) => n.to_string(),
Value::String(s) => {
            if s.chars().count() > MAX_STRING_DISPLAY_LEN {
                // Truncate on a char boundary so multi-byte strings can't panic the slice.
                let truncated: String = s.chars().take(MAX_STRING_DISPLAY_LEN).collect();
                format!("\"{}...\"", truncated)
} else {
format!("\"{}\"", s)
}
}
Value::Array(_) => "[...]".to_string(),
Value::Object(_) => "{...}".to_string(),
}
}
/// Format the path with underline showing which segment failed.
/// Returns (path_line, underline_line).
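/// e.g. tokens ["data", "users", "0"] with failed_index 1 yields:
///   /data/users/0
///         ^^^^^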
pub fn format_path_with_underline(tokens: &[String], failed_index: usize) -> (String, String) {
if tokens.is_empty() {
return ("(root)".to_string(), "^^^^^^".to_string());
}
let mut path = String::new();
let mut underline = String::new();
for (i, token) in tokens.iter().enumerate() {
let escaped = token.replace("~", "~0").replace("/", "~1");
path.push('/');
underline.push(' '); // space for the '/'
if i == failed_index {
underline.push_str(&"^".repeat(escaped.len()));
} else {
underline.push_str(&" ".repeat(escaped.len()));
}
path.push_str(&escaped);
}
(path, underline)
}
/// Build the path string for tokens up to (but not including) the given index.
pub fn path_up_to(tokens: &[String], index: usize) -> String {
if index == 0 {
return "(root)".to_string();
}
let prefix: Vec<String> = tokens[..index]
.iter()
.map(|t| t.replace("~", "~0").replace("/", "~1"))
.collect();
format!("/{}", prefix.join("/"))
}
/// Calculate Levenshtein distance between two strings.
fn levenshtein_distance(a: &str, b: &str) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 {
return b_len;
}
if b_len == 0 {
return a_len;
}
let mut prev_row: Vec<usize> = (0..=b_len).collect();
let mut curr_row: Vec<usize> = vec![0; b_len + 1];
for i in 1..=a_len {
curr_row[0] = i;
for j in 1..=b_len {
let cost = if a_chars[i - 1] == b_chars[j - 1] {
0
} else {
1
};
curr_row[j] = min(
min(prev_row[j] + 1, curr_row[j - 1] + 1),
prev_row[j - 1] + cost,
);
}
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[b_len]
}
/// Find similar keys in a list (edit distance ≤ 2).
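/// e.g. `find_similar_keys("emial", &["name", "email", "age"])` returns `["email"]`.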
pub fn find_similar_keys<'a>(target: &str, keys: &[&'a str]) -> Vec<&'a str> {
keys.iter()
.filter(|k| levenshtein_distance(target, k) <= 2)
.copied()
.collect()
}
// =============================================================================
// Error builders for get_mut traversal errors
// =============================================================================
pub fn build_key_not_found_error(
tokens: &[String],
token_index: usize,
token: &str,
keys: &[&str],
) -> Diagnostic {
let full_path = tokens_to_path(tokens);
let (path_line, underline) = format_path_with_underline(tokens, token_index);
let parent_path = path_up_to(tokens, token_index);
let description = format!(
"I was traversing the JSON path '{}' and got stuck.\n\n\
I couldn't find the key '{}'.",
full_path, token
);
let mut snippet_lines = vec![format!(" {}", path_line), format!(" {}", underline)];
snippet_lines.push(String::new());
snippet_lines.push(format!("Value at '{}':", parent_path));
for key in keys {
snippet_lines.push(format!("\"{}\": ...", key));
}
let mut advice_parts = vec![format!("Available keys: {}", keys.join(", "))];
let similar = find_similar_keys(token, keys);
if !similar.is_empty() {
advice_parts.push(format!("Did you mean '{}'?", similar[0]));
}
Diagnostic::fatal(DiagnosticCode::PathNotFound, description)
.with_snippet(snippet_lines.join("\n"))
.with_advice(advice_parts.join("\n"))
}
pub fn build_invalid_array_index_error(
tokens: &[String],
token_index: usize,
token: &str,
arr: &[Value],
) -> Diagnostic {
let full_path = tokens_to_path(tokens);
let (path_line, underline) = format_path_with_underline(tokens, token_index);
let parent_path = path_up_to(tokens, token_index);
let description = format!(
"I was traversing the JSON path '{}' and got stuck.\n\n\
I couldn't parse '{}' as an array index.",
full_path, token
);
let mut snippet_lines = vec![format!(" {}", path_line), format!(" {}", underline)];
snippet_lines.push(String::new());
snippet_lines.push(format!("Value at '{}':", parent_path));
for line in format_value_compact(&Value::Array(arr.to_vec())) {
snippet_lines.push(format!("{}", line));
}
let advice = format!(
"Array indices must be non-negative integers. Got '{}'.",
token
);
Diagnostic::fatal(DiagnosticCode::InvalidArrayIndex, description)
.with_snippet(snippet_lines.join("\n"))
.with_advice(advice)
}
pub fn build_array_index_out_of_bounds_error(
tokens: &[String],
token_index: usize,
index: usize,
arr_len: usize,
arr: &[Value],
) -> Diagnostic {
let full_path = tokens_to_path(tokens);
let (path_line, underline) = format_path_with_underline(tokens, token_index);
let parent_path = path_up_to(tokens, token_index);
let description = format!(
"I was traversing the JSON path '{}' and got stuck.\n\n\
I couldn't find index {} (array length is {}).",
full_path, index, arr_len
);
let mut snippet_lines = vec![format!(" {}", path_line), format!(" {}", underline)];
snippet_lines.push(String::new());
snippet_lines.push(format!("Value at '{}':", parent_path));
for line in format_value_compact(&Value::Array(arr.to_vec())) {
snippet_lines.push(format!("{}", line));
}
let advice = if arr_len == 0 {
"The array is empty.".to_string()
} else {
format!("Valid indices are 0-{}.", arr_len - 1)
};
Diagnostic::fatal(DiagnosticCode::PathNotFound, description)
.with_snippet(snippet_lines.join("\n"))
.with_advice(advice)
}
pub fn build_type_mismatch_error(
tokens: &[String],
token_index: usize,
token: &str,
current: &Value,
) -> Diagnostic {
let full_path = tokens_to_path(tokens);
let (path_line, underline) = format_path_with_underline(tokens, token_index);
let parent_path = path_up_to(tokens, token_index);
let type_name = value_type_name(current);
let description = format!(
"I was traversing the JSON path '{}' and got stuck.\n\n\
I can't index into {} with '{}'.",
full_path, type_name, token
);
let mut snippet_lines = vec![format!(" {}", path_line), format!(" {}", underline)];
snippet_lines.push(String::new());
snippet_lines.push(format!("Value at '{}':", parent_path));
for line in format_value_compact(current) {
snippet_lines.push(format!("{}", line));
}
let advice = if token.parse::<usize>().is_ok() {
format!(
"Array indices like '/{}' only work on arrays, not {}.",
token, type_name
)
} else {
format!(
"Object keys like '/{}' only work on objects, not {}.",
token, type_name
)
};
Diagnostic::fatal(DiagnosticCode::TypeMismatch, description)
.with_snippet(snippet_lines.join("\n"))
.with_advice(advice)
}
// =============================================================================
// Helpers
// =============================================================================
fn tokens_to_path(tokens: &[String]) -> String {
if tokens.is_empty() {
return "".to_string();
}
let escaped: Vec<String> = tokens
.iter()
.map(|t| t.replace("~", "~0").replace("/", "~1"))
.collect();
format!("/{}", escaped.join("/"))
}
fn value_type_name(value: &Value) -> &'static str {
match value {
Value::Null => "null",
Value::Bool(_) => "boolean",
Value::Number(_) => "number",
Value::String(_) => "string",
Value::Array(_) => "array",
Value::Object(_) => "object",
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_key_not_found_error_output() {
let tokens = vec!["user".to_string(), "emial".to_string()];
let keys = vec!["name", "email", "age"];
let diag = build_key_not_found_error(&tokens, 1, "emial", &keys);
println!("\n--- Key not found error ---");
println!("{}", diag);
}
#[test]
fn test_type_mismatch_error_output() {
let tokens = vec!["users".to_string(), "0".to_string(), "email".to_string(), "domain".to_string()];
let current = Value::String("alice@example.com".to_string());
let diag = build_type_mismatch_error(&tokens, 3, "domain", &current);
println!("\n--- Type mismatch error ---");
println!("{}", diag);
}
#[test]
fn test_array_out_of_bounds_error_output() {
let tokens = vec!["items".to_string(), "5".to_string()];
let arr = vec![
Value::String("apple".to_string()),
Value::String("banana".to_string()),
Value::String("cherry".to_string()),
];
let diag = build_array_index_out_of_bounds_error(&tokens, 1, 5, 3, &arr);
println!("\n--- Array out of bounds error ---");
println!("{}", diag);
}
}

View file

@@ -0,0 +1,64 @@
// Integration tests for compressed archive functionality
use json_archive::{append_to_archive, ArchiveWriter, Header};
use json_archive::{ArchiveReader, ReadMode};
use serde_json::json;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
#[cfg(feature = "compression")]
fn test_append_to_compressed_archive_basic() -> Result<(), Box<dyn std::error::Error>> {
use flate2::write::GzEncoder;
use flate2::Compression;
// Create initial archive
let archive_file = NamedTempFile::with_suffix(".json.archive")?;
let header = Header::new(json!({"count": 0}), Some("test".to_string()));
{
let mut writer = ArchiveWriter::new(archive_file.path(), None)
.map_err(|e| format!("Failed to create writer: {:?}", e))?;
writer.write_header(&header)
.map_err(|e| format!("Failed to write header: {:?}", e))?;
writer.finish()
.map_err(|e| format!("Failed to finish: {:?}", e))?;
}
// Compress it
let compressed_file = NamedTempFile::with_suffix(".json.archive.gz")?;
{
let input = std::fs::read(archive_file.path())?;
let mut encoder = GzEncoder::new(
compressed_file.as_file().try_clone()?,
Compression::default()
);
encoder.write_all(&input)?;
encoder.finish()?;
}
// Create a new state file to append
let mut state_file = NamedTempFile::new()?;
writeln!(state_file, r#"{{"count": 1}}"#)?;
state_file.flush()?;
// Append to compressed archive
let diagnostics = append_to_archive(
compressed_file.path(),
&[state_file.path()],
compressed_file.path(),
None,
None,
);
// Should succeed with no diagnostics
assert!(diagnostics.is_empty(), "Got diagnostics: {:?}", diagnostics);
// Verify the updated archive can be read back and reflects the appended state
let reader = ArchiveReader::new(compressed_file.path(), ReadMode::FullValidation)?;
let result = reader.read(compressed_file.path())?;
assert_eq!(result.final_state, json!({"count": 1}));
assert_eq!(result.observation_count, 1);
Ok(())
}

View file

@@ -0,0 +1,78 @@
# Compression Integration Tests
Manual integration tests for compressed archive functionality.
These scripts exercise the tool's ability to:
1. Read archives that were compressed by external programs (gzip, brotli, zstd)
2. Append new observations to compressed archives
3. Produce the same results whether reading the compressed or the uncompressed archive
## Scripts
### `generate_state.py <n>`
Generates a JSON state file with `n` items in each array. Output goes to stdout.
```bash
./generate_state.py 3
# Output: {"colors":["color_1","color_2","color_3"],"numbers":["number_1","number_2","number_3"],"animals":["animal_1","animal_2","animal_3"]}
```
### `generate_state_files.py <count> <output_dir>`
Generates a series of state files (state_1.json through state_N.json) with progressively more items.
```bash
./generate_state_files.py 9 ./data
# Creates: data/state_1.json, data/state_2.json, ... data/state_9.json
```
### `run_gzip_test.sh`
Tests the gzip compression workflow:
1. Create archive from first state file
2. Compress with gzip
3. Append remaining 8 state files to the compressed archive
4. Decompress and inspect
### `run_brotli_test.sh`
Same workflow but with brotli compression.
### `run_zstd_test.sh`
Same workflow but with zstd compression.
### `run_all.sh`
Runs all compression tests in sequence.
### `validate.sh` (optional)
Smoke test to verify the final state matches expectations.
## Usage
```bash
cd tests/compression-integration
# Run all tests (generates data, builds, runs all compression formats)
./run_all.sh
# Or run individual steps:
./generate_state_files.py 9 ./data
./run_gzip_test.sh
./run_brotli_test.sh
./run_zstd_test.sh
# Optional: validate outputs match
./validate.sh
```
## What to look for
After running the tests, you can manually verify (example commands below):
1. The compressed archives were created
2. Appending to compressed archives worked (check file sizes grew)
3. The `info` command shows the same observation count for compressed and decompressed versions
4. The `state` command returns the same final state
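
For example, assuming `run_all.sh` has been run from this directory, a quick manual
spot check of the gzip output might look like this (paths are the defaults the scripts use):

```bash
BINARY=../../target/debug/json-archive
$BINARY info out/gzip/test.json.archive.gz     # compressed archive
$BINARY state out/gzip/test.json.archive.gz
$BINARY info out/gzip/test.json.archive        # decompressed copy left by run_gzip_test.sh
```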
## Dependencies
- Python 3 (for generating test data)
- gzip (usually pre-installed)
- brotli (`brew install brotli`)
- zstd (`brew install zstd`)

View file

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""
Generate a JSON state file with N items in each array.
Output goes to stdout.
Usage: ./generate_state.py <n>
"""
import json
import sys
def main():
if len(sys.argv) != 2:
print("Usage: generate_state.py <n>", file=sys.stderr)
sys.exit(1)
n = int(sys.argv[1])
state = {
"colors": [f"color_{i}" for i in range(1, n + 1)],
"numbers": [f"number_{i}" for i in range(1, n + 1)],
"animals": [f"animal_{i}" for i in range(1, n + 1)],
}
print(json.dumps(state))
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""
Generate a series of state files with progressively more items.
Usage: ./generate_state_files.py <count> <output_dir>
Creates: output_dir/state_1.json, state_2.json, ..., state_N.json
"""
import json
import os
import sys
def generate_state(n):
return {
"colors": [f"color_{i}" for i in range(1, n + 1)],
"numbers": [f"number_{i}" for i in range(1, n + 1)],
"animals": [f"animal_{i}" for i in range(1, n + 1)],
}
def main():
if len(sys.argv) != 3:
print("Usage: generate_state_files.py <count> <output_dir>", file=sys.stderr)
sys.exit(1)
count = int(sys.argv[1])
output_dir = sys.argv[2]
os.makedirs(output_dir, exist_ok=True)
for i in range(1, count + 1):
state = generate_state(i)
path = os.path.join(output_dir, f"state_{i}.json")
with open(path, "w") as f:
json.dump(state, f)
print(f"Created {path}")
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
#
# Run all compression integration tests.
#
# Usage: ./run_all.sh
#
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
echo "=== Building json-archive with compression support ==="
cd "$PROJECT_DIR"
cargo build --features compression
echo ""
echo "=== Generating test data ==="
cd "$SCRIPT_DIR"
python3 generate_state_files.py 9 ./data
echo ""
"$SCRIPT_DIR/run_gzip_test.sh"
echo ""
"$SCRIPT_DIR/run_brotli_test.sh"
echo ""
"$SCRIPT_DIR/run_zstd_test.sh"
echo ""
echo "=== All tests complete ==="
echo "Output files are in: $SCRIPT_DIR/out/"

View file

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# Test brotli compression workflow:
# 1. Create archive from first state file
# 2. Compress with brotli
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/brotli"
echo "=== Brotli Compression Test ==="
# Setup
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"
# Create initial archive from first state file
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$OUT_DIR/test.json.archive"
# Compress with brotli
echo "Compressing with brotli..."
brotli "$OUT_DIR/test.json.archive"
ls -la "$OUT_DIR/"
# Append remaining files to compressed archive
for i in $(seq 2 9); do
echo "Appending state_$i.json to compressed archive..."
"$BINARY" "$OUT_DIR/test.json.archive.br" "$DATA_DIR/state_$i.json"
done
# Show info on the result
echo ""
echo "Final archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive.br"
# Decompress for manual inspection
echo ""
echo "Decompressing for comparison..."
brotli -d -k "$OUT_DIR/test.json.archive.br"
echo ""
echo "Decompressed archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive"
echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# Test gzip compression workflow:
# 1. Create archive from first state file
# 2. Compress with gzip
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/gzip"
echo "=== Gzip Compression Test ==="
# Setup
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"
# Create initial archive from first state file
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$OUT_DIR/test.json.archive"
# Compress with gzip
echo "Compressing with gzip..."
gzip "$OUT_DIR/test.json.archive"
ls -la "$OUT_DIR/"
# Append remaining files to compressed archive
for i in $(seq 2 9); do
echo "Appending state_$i.json to compressed archive..."
"$BINARY" "$OUT_DIR/test.json.archive.gz" "$DATA_DIR/state_$i.json"
done
# Show info on the result
echo ""
echo "Final archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive.gz"
# Decompress for manual inspection
echo ""
echo "Decompressing for comparison..."
gunzip -k "$OUT_DIR/test.json.archive.gz" 2>/dev/null || gunzip -c "$OUT_DIR/test.json.archive.gz" > "$OUT_DIR/test.json.archive"
echo ""
echo "Decompressed archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive"
echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
#
# Test zstd compression workflow:
# 1. Create archive from first state file
# 2. Compress with zstd
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/zstd"
echo "=== Zstd Compression Test ==="
# Setup
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"
# Create initial archive from first state file
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$OUT_DIR/test.json.archive"
# Compress with zstd
echo "Compressing with zstd..."
zstd "$OUT_DIR/test.json.archive"
rm "$OUT_DIR/test.json.archive"
ls -la "$OUT_DIR/"
# Append remaining files to compressed archive
for i in $(seq 2 9); do
echo "Appending state_$i.json to compressed archive..."
"$BINARY" "$OUT_DIR/test.json.archive.zst" "$DATA_DIR/state_$i.json"
done
# Show info on the result
echo ""
echo "Final archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive.zst"
# Decompress for manual inspection
echo ""
echo "Decompressing for comparison..."
zstd -d -k "$OUT_DIR/test.json.archive.zst"
echo ""
echo "Decompressed archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive"
echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@@ -0,0 +1,63 @@
#!/usr/bin/env bash
#
# Validate that compressed and decompressed archives produce the same results.
# Run this after run_all.sh to smoke test the outputs.
#
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
echo "=== Validation ==="
errors=0
for format in gzip brotli zstd; do
dir="$SCRIPT_DIR/out/$format"
if [ ! -d "$dir" ]; then
echo "SKIP: $format (no output directory)"
continue
fi
# Find the compressed and uncompressed files
compressed=$(find "$dir" -name "*.gz" -o -name "*.br" -o -name "*.zst" | head -1)
uncompressed="$dir/test.json.archive"
if [ ! -f "$compressed" ] || [ ! -f "$uncompressed" ]; then
echo "SKIP: $format (missing files)"
continue
fi
# Compare state output
state_compressed=$("$BINARY" state "$compressed")
state_uncompressed=$("$BINARY" state "$uncompressed")
if [ "$state_compressed" = "$state_uncompressed" ]; then
echo "OK: $format - state matches"
else
echo "FAIL: $format - state differs"
errors=$((errors + 1))
fi
# Compare observation count from info
count_compressed=$("$BINARY" info "$compressed" --output json | python3 -c "import sys,json; print(json.load(sys.stdin)['observation_count'])")
count_uncompressed=$("$BINARY" info "$uncompressed" --output json | python3 -c "import sys,json; print(json.load(sys.stdin)['observation_count'])")
if [ "$count_compressed" = "$count_uncompressed" ]; then
echo "OK: $format - observation count matches ($count_compressed)"
else
echo "FAIL: $format - observation count differs ($count_compressed vs $count_uncompressed)"
errors=$((errors + 1))
fi
done
echo ""
if [ $errors -eq 0 ]; then
echo "All validations passed."
else
echo "$errors validation(s) failed."
exit 1
fi