feat: add support for reading compressed archive files

nobody 2025-09-30 11:19:13 -07:00
commit 1f0f41a96c
Signed by: GrocerPublishAgent
GPG key ID: 43B1C298CDDE181C
8 changed files with 552 additions and 272 deletions

Cargo.lock (generated)

@@ -2,6 +2,27 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -32,6 +53,27 @@ version = "2.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
 
+[[package]]
+name = "brotli"
+version = "8.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "5.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.19.0"
@@ -45,6 +87,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e1354349954c6fc9cb0deab020f27f783cf0b604e8bb754dc4658ecf0d29c35f"
 dependencies = [
  "find-msvc-tools",
+ "jobserver",
+ "libc",
  "shlex",
 ]
 
@@ -74,6 +118,15 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "derive_arbitrary"
 version = "1.4.2"
@@ -107,6 +160,16 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959"
 
+[[package]]
+name = "flate2"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.3.3"
@@ -149,6 +212,16 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
 
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom",
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.81"
@@ -164,12 +237,15 @@ name = "json-archive"
 version = "0.99.0"
 dependencies = [
  "arbitrary",
+ "brotli",
  "chrono",
+ "flate2",
  "serde",
  "serde_json",
  "tempfile",
  "uuid",
  "xflags",
+ "zstd",
 ]
 
 [[package]]
@@ -196,6 +272,15 @@ version = "2.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
 
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.19"
@@ -211,6 +296,12 @@ version = "1.21.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
 
+[[package]]
+name = "pkg-config"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.101"
@@ -516,3 +607,31 @@ name = "xflags-macros"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "672423d4fea7ffa2f6c25ba60031ea13dc6258070556f125cc4d790007d4a155"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]

Cargo.toml

@@ -3,6 +3,10 @@ name = "json-archive"
 version = "0.99.0"
 edition = "2021"
 
+[features]
+default = ["compression"]
+compression = ["flate2", "brotli", "zstd"]
+
 [dependencies]
 xflags = "0.3"
 serde = { version = "1.0", features = ["derive"] }
@@ -10,6 +14,11 @@ serde_json = "1.0"
 chrono = { version = "0.4", features = ["serde"] }
 uuid = { version = "1.0", features = ["v4", "serde"] }
 
+# Compression support (optional, enabled by default)
+flate2 = { version = "1.0", optional = true }
+brotli = { version = "8.0", optional = true }
+zstd = { version = "0.13", optional = true }
+
 [dev-dependencies]
 tempfile = "3.0"
 arbitrary = { version = "1.0", features = ["derive"] }
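
The `[features]` table above is ordinary Cargo feature wiring, so both configurations can be exercised with standard Cargo flags alone. A quick local sanity check might look like this (nothing here is project-specific beyond the `compression` feature name):

```bash
# Default build: the compression feature pulls in flate2, brotli, and zstd
cargo build --release

# Minimal build: the optional compression crates are never compiled
cargo build --release --no-default-features

# Opt back in explicitly from a minimal baseline
cargo build --release --no-default-features --features compression
```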

README.md

@@ -57,12 +57,20 @@ json-archive videoID.info.json
 ### Compression support (as a concession)
 
-While the core design keeps things simple and readable, the tool does work with compressed archives as a practical concession for those who need it. You can read from and write to gzip, brotli, and zlib compressed files without special flags.
+While the core design keeps things simple and readable, the tool does work with compressed archives as a practical concession for those who need it. You can read from and write to gzip, deflate, zlib, brotli, and zstd compressed files without special flags.
 
 **Important caveat**: Compressed archives may require rewriting the entire file during updates (depending on the compression format). If your temporary filesystem is full or too small, updates can fail. In that case, manually specify an output destination with `-o` to write the new archive elsewhere.
 
 This works fine for the happy path with archive files up to a few hundred megabytes, but contradicts the "keep it simple" design philosophy - it's included because it's practically useful.
 
+**Building without compression**: Compression libraries are a security vulnerability vector. The default build includes them because most users want convenience. If you don't want to bundle compression libraries:
+
+```bash
+cargo install json-archive --no-default-features
+```
+
+The minimal build detects compressed files and errors with a clear message explaining you need the full version or manual decompression.
+
 ## Archive format
 
 The format is JSONL with delta-based changes using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths. For complete technical details about the file format, see the [file format specification](docs/file-format-spec.md).
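
Since detection is by file content rather than flags, the round trip is worth sketching. A plausible session follows; `gzip` and the `-o` flag are from the text above, while the `info` subcommand is an assumption inferred from the `flags::Info` handler elsewhere in this commit, since the full CLI surface isn't shown in this diff:

```bash
# Compress an existing archive; the reader identifies the format from the
# file's magic bytes, not its extension
gzip videoID.info.json          # produces videoID.info.json.gz

# Reads transparently; no decompression flag required
json-archive info videoID.info.json.gz

# Hypothetical update run: if the temporary filesystem is too small for the
# full rewrite, redirect the output with -o as the caveat above suggests
json-archive videoID.info.json.gz -o /data/videoID.info.json.gz
```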

Binary file not shown.

Binary file not shown.


@@ -21,11 +21,8 @@
 use crate::flags;
 use chrono::{DateTime, Utc};
-use json_archive::{Diagnostic, DiagnosticCode, DiagnosticLevel};
+use json_archive::{Diagnostic, DiagnosticCode, DiagnosticLevel, Event};
 use serde::Serialize;
-use serde_json::Value;
-use std::fs::File;
-use std::io::{BufRead, BufReader};
 use std::path::Path;
 
 #[derive(Debug)]
@@ -53,6 +50,8 @@ struct JsonInfoOutput {
     file_size: u64,
     snapshot_count: usize,
     observations: Vec<JsonObservation>,
+    total_json_size: u64,
+    efficiency_percent: f64,
 }
 
 pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
@@ -69,8 +68,8 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
         )];
     }
 
-    let observations = match collect_observations(&flags.file) {
-        Ok(obs) => obs,
+    let (observations, snapshot_count) = match collect_observations(&flags.file) {
+        Ok((obs, count)) => (obs, count),
         Err(diagnostics) => return diagnostics,
     };
@@ -79,7 +78,15 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
         Err(_) => 0,
     };
 
-    let snapshot_count = count_snapshots(&flags.file).unwrap_or(0);
+    // Calculate total JSON size (sum of all observations + newline separators)
+    let total_json_size: u64 = observations.iter().map(|obs| obs.json_size as u64).sum::<u64>()
+        + (observations.len() as u64).saturating_sub(1); // Add newlines between observations
+
+    let efficiency_percent = if total_json_size > 0 {
+        (file_size as f64 / total_json_size as f64) * 100.0
+    } else {
+        0.0
+    };
 
     // Check output format
     let is_json_output = flags.output.as_ref().map(|s| s == "json").unwrap_or(false);
@@ -93,6 +100,8 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
             file_size,
             snapshot_count,
             observations: Vec::new(),
+            total_json_size: 0,
+            efficiency_percent: 0.0,
         };
         println!(
             "{}",
@@ -123,6 +132,8 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
             file_size,
             snapshot_count,
             observations: json_observations,
+            total_json_size,
+            efficiency_percent,
         };
 
         println!(
@@ -193,10 +204,22 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
     } else {
         format!("{} snapshots", snapshot_count)
     };
 
+    let comparison = if efficiency_percent < 100.0 {
+        format!("{:.1}% smaller", 100.0 - efficiency_percent)
+    } else {
+        format!("{:.1}% larger", efficiency_percent - 100.0)
+    };
+
     println!(
-        "Total archive size: {} ({})",
+        "Archive size: {} ({}, {} than JSON Lines)",
         format_size(file_size),
-        snapshot_text
+        snapshot_text,
+        comparison
     );
+    println!(
+        "Data size: {}",
+        format_size(total_json_size)
+    );
 
     // Add usage instructions
@@ -222,9 +245,9 @@ pub fn run(flags: &flags::Info) -> Vec<Diagnostic> {
     Vec::new()
 }
 
-fn collect_observations(file_path: &Path) -> Result<Vec<ObservationInfo>, Vec<Diagnostic>> {
-    let file = match File::open(file_path) {
-        Ok(f) => f,
+fn collect_observations(file_path: &Path) -> Result<(Vec<ObservationInfo>, usize), Vec<Diagnostic>> {
+    let reader = match json_archive::ArchiveReader::new(file_path, json_archive::ReadMode::AppendSeek) {
+        Ok(r) => r,
         Err(e) => {
             return Err(vec![Diagnostic::new(
                 DiagnosticLevel::Fatal,
@@ -234,44 +257,32 @@ fn collect_observations(file_path: &Path) -> Result<Vec<ObservationInfo>, Vec<Diagnostic>> {
         }
     };
 
-    let reader = BufReader::new(file);
-    let mut lines = reader.lines();
-    let mut observations = Vec::new();
-
-    // Parse header
-    let header_line = match lines.next() {
-        Some(Ok(line)) => line,
-        _ => {
-            return Err(vec![Diagnostic::new(
-                DiagnosticLevel::Fatal,
-                DiagnosticCode::EmptyFile,
-                "Archive file is empty or unreadable".to_string(),
-            )]);
-        }
-    };
-
-    let header: Value = match serde_json::from_str(&header_line) {
-        Ok(h) => h,
+    let (initial_state, mut event_iter) = match reader.events(file_path) {
+        Ok(r) => r,
         Err(e) => {
             return Err(vec![Diagnostic::new(
                 DiagnosticLevel::Fatal,
-                DiagnosticCode::MissingHeader,
-                format!("I couldn't parse the header: {}", e),
+                DiagnosticCode::PathNotFound,
+                format!("I couldn't read the archive file: {}", e),
             )]);
         }
     };
 
-    let created_str = header["created"].as_str().unwrap_or("");
-    let created: DateTime<Utc> = match created_str.parse() {
-        Ok(dt) => dt,
-        Err(_) => Utc::now(),
-    };
+    // Check for fatal diagnostics from initial parsing
+    if event_iter.diagnostics.has_fatal() {
+        return Err(event_iter.diagnostics.diagnostics().to_vec());
+    }
 
-    let initial_state = header["initial"].clone();
+    let mut observations = Vec::new();
+    let mut current_state = initial_state.clone();
+    let mut snapshot_count = 0;
+
     let initial_size = serde_json::to_string(&initial_state)
         .unwrap_or_default()
         .len();
+    let created = event_iter.header.created;
 
     // Add initial state as observation 0
     observations.push(ObservationInfo {
         id: "initial".to_string(),
@@ -281,54 +292,73 @@ fn collect_observations(file_path: &Path) -> Result<(Vec<ObservationInfo>, usize), Vec<Diagnostic>> {
         json_size: initial_size,
     });
 
-    let mut current_state = initial_state;
-
-    // Parse events
-    for line in lines {
-        let line = match line {
-            Ok(l) => l,
-            Err(_) => continue,
-        };
-
-        if line.trim().starts_with('#') || line.trim().is_empty() {
-            continue;
-        }
-
-        let event: Value = match serde_json::from_str(&line) {
-            Ok(e) => e,
-            Err(_) => continue,
-        };
-
-        if let Some(arr) = event.as_array() {
-            if arr.is_empty() {
-                continue;
-            }
-
-            let event_type = arr[0].as_str().unwrap_or("");
-
-            if event_type == "observe" && arr.len() >= 4 {
-                let obs_id = arr[1].as_str().unwrap_or("").to_string();
-                let timestamp_str = arr[2].as_str().unwrap_or("");
-                let change_count = arr[3].as_u64().unwrap_or(0) as usize;
-
-                let timestamp: DateTime<Utc> = match timestamp_str.parse() {
-                    Ok(dt) => dt,
-                    Err(_) => continue,
-                };
-
+    // Iterate through events
+    while let Some(event) = event_iter.next() {
+        match event {
+            Event::Observe { observation_id, timestamp, change_count } => {
                 observations.push(ObservationInfo {
-                    id: obs_id,
+                    id: observation_id,
                     timestamp,
                     created,
                     change_count,
                     json_size: 0, // Will be calculated after applying events
                 });
-            } else {
-                // Apply the event to current_state for size calculation
-                apply_event_to_state(&mut current_state, &arr);
-                // Update the JSON size of the last observation
-                if let Some(last_obs) = observations.last_mut() {
+            }
+            Event::Add { path, value, .. } => {
+                let _ = json_archive::apply_add(&mut current_state, &path, value);
+                // Update the JSON size of the last observation
+                if let Some(last_obs) = observations.last_mut() {
+                    if last_obs.id != "initial" {
+                        last_obs.json_size = serde_json::to_string(&current_state)
+                            .unwrap_or_default()
+                            .len();
+                    }
+                }
+            }
+            Event::Change { path, new_value, .. } => {
+                let _ = json_archive::apply_change(&mut current_state, &path, new_value);
+                // Update the JSON size of the last observation
+                if let Some(last_obs) = observations.last_mut() {
+                    if last_obs.id != "initial" {
+                        last_obs.json_size = serde_json::to_string(&current_state)
+                            .unwrap_or_default()
+                            .len();
+                    }
+                }
+            }
+            Event::Remove { path, .. } => {
+                let _ = json_archive::apply_remove(&mut current_state, &path);
+                // Update the JSON size of the last observation
+                if let Some(last_obs) = observations.last_mut() {
+                    if last_obs.id != "initial" {
+                        last_obs.json_size = serde_json::to_string(&current_state)
+                            .unwrap_or_default()
+                            .len();
+                    }
+                }
+            }
+            Event::Move { path, moves, .. } => {
+                let _ = json_archive::apply_move(&mut current_state, &path, moves);
+                // Update the JSON size of the last observation
+                if let Some(last_obs) = observations.last_mut() {
+                    if last_obs.id != "initial" {
+                        last_obs.json_size = serde_json::to_string(&current_state)
+                            .unwrap_or_default()
+                            .len();
+                    }
+                }
+            }
+            Event::Snapshot { object, .. } => {
+                current_state = object;
+                snapshot_count += 1;
+                // Update the JSON size of the last observation
+                if let Some(last_obs) = observations.last_mut() {
+                    if last_obs.id != "initial" {
                         last_obs.json_size = serde_json::to_string(&current_state)
                             .unwrap_or_default()
                             .len();
@@ -336,41 +366,11 @@ fn collect_observations(file_path: &Path) -> Result<(Vec<ObservationInfo>, usize), Vec<Diagnostic>> {
                     }
                 }
             }
-        }
-    }
-
-    Ok(observations)
-}
-
-fn apply_event_to_state(state: &mut Value, event: &[Value]) {
-    if event.is_empty() {
-        return;
-    }
-
-    let event_type = event[0].as_str().unwrap_or("");
-
-    match event_type {
-        "add" if event.len() >= 3 => {
-            let path = event[1].as_str().unwrap_or("");
-            let value = event[2].clone();
-            if let Ok(pointer) = json_archive::pointer::JsonPointer::new(path) {
-                let _ = pointer.set(state, value);
-            }
-        }
-        "change" if event.len() >= 3 => {
-            let path = event[1].as_str().unwrap_or("");
-            let value = event[2].clone();
-            if let Ok(pointer) = json_archive::pointer::JsonPointer::new(path) {
-                let _ = pointer.set(state, value);
-            }
-        }
-        "remove" if event.len() >= 2 => {
-            let path = event[1].as_str().unwrap_or("");
-            if let Ok(pointer) = json_archive::pointer::JsonPointer::new(path) {
-                let _ = pointer.remove(state);
-            }
-        }
-        _ => {}
-    }
-}
+        }
+    }
+
+    Ok((observations, snapshot_count))
+}
 
 fn format_timestamp(dt: &DateTime<Utc>) -> String {
     dt.format("%a %H:%M:%S %d-%b-%Y").to_string()
@@ -393,18 +393,3 @@ fn format_size(bytes: u64) -> String {
         format!("{:.1} MB", bytes as f64 / (1024.0 * 1024.0))
     }
 }
-
-fn count_snapshots(file_path: &Path) -> Result<usize, std::io::Error> {
-    let file = File::open(file_path)?;
-    let reader = BufReader::new(file);
-    let mut count = 0;
-
-    for line in reader.lines() {
-        let line = line?;
-        if line.trim().starts_with('[') && line.contains("\"snapshot\"") {
-            count += 1;
-        }
-    }
-
-    Ok(count)
-}
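
The rewritten `collect_observations` above is the first consumer of the streaming API this commit introduces. Reduced to its skeleton, the consumption pattern looks like the sketch below. It is assembled from the signatures visible in this diff (`ArchiveReader::new`, `events`, the `Event` variants, and the newly re-exported `apply_*` helpers), not a verbatim excerpt from the codebase:

```rust
use json_archive::{ArchiveReader, Event, ReadMode};
use serde_json::Value;
use std::path::Path;

fn replay(path: &Path) -> std::io::Result<Value> {
    let reader = ArchiveReader::new(path, ReadMode::AppendSeek)?;
    // events() hands back the header's initial state plus a streaming iterator;
    // any decompression already happened behind the BufRead it wraps.
    let (mut state, mut events) = reader.events(path)?;

    while let Some(event) = events.next() {
        match event {
            Event::Add { path, value, .. } => {
                let _ = json_archive::apply_add(&mut state, &path, value);
            }
            Event::Change { path, new_value, .. } => {
                let _ = json_archive::apply_change(&mut state, &path, new_value);
            }
            Event::Remove { path, .. } => {
                let _ = json_archive::apply_remove(&mut state, &path);
            }
            Event::Move { path, moves, .. } => {
                let _ = json_archive::apply_move(&mut state, &path, moves);
            }
            Event::Snapshot { object, .. } => state = object,
            Event::Observe { .. } => {} // bookkeeping only; no state change
        }
    }

    // Diagnostics accumulate on the iterator instead of aborting the stream;
    // callers decide what counts as fatal.
    if events.diagnostics.has_fatal() {
        eprintln!("archive contained fatal diagnostics");
    }

    Ok(state)
}
```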

src/lib.rs

@@ -36,4 +36,4 @@ pub use detection::is_json_archive;
 pub use diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel};
 pub use events::{Event, Header, Observation};
 pub use pointer::JsonPointer;
-pub use reader::{ArchiveReader, ReadMode, ReadResult};
+pub use reader::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, ReadMode, ReadResult};

src/reader.rs

@@ -22,7 +22,7 @@
 use serde_json::Value;
 use std::collections::HashSet;
 use std::fs::File;
-use std::io::{BufRead, BufReader};
+use std::io::{BufRead, BufReader, Read};
 use std::path::Path;
 
 use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel};
@@ -30,12 +30,29 @@ use crate::event_deserialize::EventDeserializer;
 use crate::events::{Event, Header};
 use crate::pointer::JsonPointer;
 
+#[cfg(feature = "compression")]
+use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
+#[cfg(feature = "compression")]
+use brotli::Decompressor;
+#[cfg(feature = "compression")]
+use zstd::stream::read::Decoder as ZstdDecoder;
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ReadMode {
     FullValidation,
     AppendSeek,
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum CompressionFormat {
+    Gzip,
+    Deflate,
+    Zlib,
+    Brotli,
+    Zstd,
+    None,
+}
+
 pub struct ArchiveReader {
     mode: ReadMode,
     filename: String,
@@ -49,134 +66,46 @@ pub struct ReadResult {
     pub observation_count: usize,
 }
 
-impl ArchiveReader {
-    pub fn new<P: AsRef<Path>>(path: P, mode: ReadMode) -> std::io::Result<Self> {
-        let filename = path.as_ref().display().to_string();
-        Ok(Self { mode, filename })
-    }
+pub struct EventIterator {
+    reader: Box<dyn BufRead>,
+    pub diagnostics: DiagnosticCollector,
+    pub header: Header,
+    filename: String,
+    line_number: usize,
+}
 
-    pub fn read<P: AsRef<Path>>(&self, path: P) -> std::io::Result<ReadResult> {
-        let file = File::open(path)?;
-        let reader = BufReader::new(file);
-        let mut diagnostics = DiagnosticCollector::new();
-        let mut lines_iter = reader.lines().enumerate();
+impl Iterator for EventIterator {
+    type Item = Event;
 
-        let (header_line_number, header_line) = match lines_iter.next() {
-            Some((idx, Ok(line))) => (idx + 1, line),
-            Some((idx, Err(e))) if e.kind() == std::io::ErrorKind::InvalidData => {
-                let line_number = idx + 1;
-                diagnostics.add(
-                    Diagnostic::new(
-                        DiagnosticLevel::Fatal,
-                        DiagnosticCode::InvalidUtf8,
-                        format!("I found invalid UTF-8 bytes at line {}.", line_number)
-                    )
-                    .with_location(self.filename.clone(), line_number)
-                    .with_advice(
-                        "The JSON Archive format requires UTF-8 encoding. Make sure the file \
-                         was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding."
-                            .to_string()
-                    )
-                );
-                return Ok(ReadResult {
-                    header: Header::new(Value::Null, None),
-                    final_state: Value::Null,
-                    diagnostics,
-                    observation_count: 0,
-                });
-            }
-            Some((_, Err(e))) => return Err(e),
-            None => {
-                diagnostics.add(
-                    Diagnostic::new(
-                        DiagnosticLevel::Fatal,
-                        DiagnosticCode::EmptyFile,
-                        "I found an empty file, but I need at least a header line.".to_string(),
-                    )
-                    .with_location(self.filename.clone(), 1)
-                    .with_advice(
-                        "A valid JSON Archive file must start with a header object containing:\n\
-                         - type: \"@peoplesgrocers/json-archive\"\n\
-                         - version: 1\n\
-                         - created: an ISO-8601 timestamp\n\
-                         - initial: the initial state of the object"
-                            .to_string(),
-                    ),
-                );
-                return Ok(ReadResult {
-                    header: Header::new(Value::Null, None),
-                    final_state: Value::Null,
-                    diagnostics,
-                    observation_count: 0,
-                });
-            }
-        };
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut line = String::new();
 
-        let header = match self.parse_header(&header_line, header_line_number, &mut diagnostics) {
-            Some(h) => h,
-            None => {
-                return Ok(ReadResult {
-                    header: Header::new(Value::Null, None),
-                    final_state: Value::Null,
-                    diagnostics,
-                    observation_count: 0,
-                });
-            }
-        };
+        loop {
+            line.clear();
+            self.line_number += 1;
 
-        let mut state = header.initial.clone();
-        let mut seen_observations: HashSet<String> = HashSet::new();
-        let mut current_observation: Option<(String, usize, usize)> = None;
-        let mut events_in_observation = 0;
-        let mut observation_count = 0;
-
-        // This manual dispatcher mirrors what serde would expand but stays explicit so we can
-        // attach Elm-style diagnostics with precise spans and guidance for each failure case.
-        for (idx, line_result) in lines_iter {
-            let line_number = idx + 1;
-            let line = match line_result {
-                Ok(line) => line,
-                Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
-                    diagnostics.add(
-                        Diagnostic::new(
-                            DiagnosticLevel::Fatal,
-                            DiagnosticCode::InvalidUtf8,
-                            format!("I found invalid UTF-8 bytes at line {}.", line_number)
-                        )
-                        .with_location(self.filename.clone(), line_number)
-                        .with_advice(
-                            "The JSON Archive format requires UTF-8 encoding. Make sure the file \
-                             was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding."
-                                .to_string()
-                        )
-                    );
-                    return Ok(ReadResult {
-                        header: Header::new(Value::Null, None),
-                        final_state: Value::Null,
-                        diagnostics,
-                        observation_count: 0,
-                    });
-                }
-                Err(e) => return Err(e),
-            };
+            match self.reader.read_line(&mut line) {
+                Ok(0) => return None, // EOF
+                Ok(_) => {
                     let trimmed = line.trim();
 
+                    // Skip comments and blank lines
                     if trimmed.starts_with('#') || trimmed.is_empty() {
                         continue;
                     }
 
+                    // Try to parse as event
                     let event_deserializer = match serde_json::from_str::<EventDeserializer>(&line) {
                         Ok(d) => d,
                         Err(e) => {
-                            diagnostics.add(
+                            self.diagnostics.add(
                                 Diagnostic::new(
                                     DiagnosticLevel::Fatal,
                                     DiagnosticCode::InvalidEventJson,
                                     format!("I couldn't parse this line as JSON: {}", e),
                                 )
-                                .with_location(self.filename.clone(), line_number)
-                                .with_snippet(format!("{} | {}", line_number, line))
+                                .with_location(self.filename.clone(), self.line_number)
+                                .with_snippet(format!("{} | {}", self.line_number, line.trim()))
                                 .with_advice(
                                     "Each line after the header must be either:\n\
                                      - A comment starting with #\n\
@@ -189,31 +118,261 @@ impl ArchiveReader {
                                 }
                             };
 
-                    // Add any diagnostics from deserialization with location info
+                    // Add any diagnostics from deserialization
                     for diagnostic in event_deserializer.diagnostics {
-                        diagnostics.add(
+                        self.diagnostics.add(
                             diagnostic
-                                .with_location(self.filename.clone(), line_number)
-                                .with_snippet(format!("{} | {}", line_number, line))
+                                .with_location(self.filename.clone(), self.line_number)
+                                .with_snippet(format!("{} | {}", self.line_number, line.trim()))
                         );
                     }
 
-                    // Continue processing to collect additional errors before failing.
-                    // Even though this function must now return an error, we continue to help
-                    // the user identify all issues in the file at once rather than one at a time.
-                    let event = match event_deserializer.event {
-                        Some(e) => e,
-                        None => {
-                            assert!(diagnostics.has_fatal(), "Expected a fatal diagnostic when deserialization fails");
-                            continue
-                        },
-                    };
+                    // Return event if we have one
+                    if let Some(event) = event_deserializer.event {
+                        return Some(event);
+                    }
+
+                    // If no event but had diagnostics, continue to next line
+                    continue;
+                }
+                Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
+                    self.diagnostics.add(
+                        Diagnostic::new(
+                            DiagnosticLevel::Fatal,
+                            DiagnosticCode::InvalidUtf8,
+                            format!("I found invalid UTF-8 bytes at line {}.", self.line_number)
+                        )
+                        .with_location(self.filename.clone(), self.line_number)
+                        .with_advice(
+                            "The JSON Archive format requires UTF-8 encoding. Make sure the file \
+                             was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding."
+                                .to_string()
+                        )
+                    );
+                    return None;
+                }
+                Err(_) => return None,
+            }
+        }
+    }
+}
+
+fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat {
+    if bytes.len() < 4 {
+        return CompressionFormat::None;
+    }
+
+    // Gzip magic number: 0x1f 0x8b
+    if bytes[0] == 0x1f && bytes[1] == 0x8b {
+        return CompressionFormat::Gzip;
+    }
+
+    // Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda
+    if bytes[0] == 0x78 && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) {
+        return CompressionFormat::Zlib;
+    }
+
+    // Zstd magic number: 0x28 0xb5 0x2f 0xfd
+    if bytes.len() >= 4 && bytes[0] == 0x28 && bytes[1] == 0xb5 && bytes[2] == 0x2f && bytes[3] == 0xfd {
+        return CompressionFormat::Zstd;
+    }
+
+    // Check file extension for brotli (no reliable magic number) and deflate
+    if let Some(ext) = path.extension() {
+        let ext_str = ext.to_string_lossy();
+        if ext_str == "br" || path.to_string_lossy().contains(".br.") {
+            return CompressionFormat::Brotli;
+        }
+        if ext_str == "deflate" {
+            return CompressionFormat::Deflate;
+        }
+    }
+
+    CompressionFormat::None
+}
+
+impl ArchiveReader {
+    pub fn new<P: AsRef<Path>>(path: P, mode: ReadMode) -> std::io::Result<Self> {
+        let filename = path.as_ref().display().to_string();
+        Ok(Self { mode, filename })
+    }
+
+    pub fn events<P: AsRef<Path>>(&self, path: P) -> std::io::Result<(Value, EventIterator)> {
+        let path = path.as_ref();
+        let mut file = File::open(path)?;
+
+        // Detect compression format
+        let mut magic_bytes = [0u8; 4];
+        let bytes_read = file.read(&mut magic_bytes)?;
+        let compression_format = detect_compression_format(path, &magic_bytes[..bytes_read]);
+
+        // Re-open file to reset position
+        file = File::open(path)?;
+
+        let mut diagnostics = DiagnosticCollector::new();
+
+        // Check if compression is detected but not supported
+        #[cfg(not(feature = "compression"))]
+        if compression_format != CompressionFormat::None {
+            let format_name = match compression_format {
+                CompressionFormat::Gzip => "gzip",
+                CompressionFormat::Deflate => "deflate",
+                CompressionFormat::Zlib => "zlib",
+                CompressionFormat::Brotli => "brotli",
+                CompressionFormat::Zstd => "zstd",
+                CompressionFormat::None => unreachable!(),
+            };
+            diagnostics.add(
+                Diagnostic::new(
+                    DiagnosticLevel::Fatal,
+                    DiagnosticCode::UnsupportedVersion,
+                    format!("I detected a {}-compressed archive, but this build doesn't support compression.", format_name)
+                )
+                .with_location(self.filename.clone(), 1)
+                .with_advice(
+                    "This binary was built without compression support to reduce binary size and dependencies.\n\
+                     You have two options:\n\
+                     1. Install the version with compression support: cargo install json-archive --features compression\n\
+                     2. Manually decompress the file first, then use this tool on the uncompressed archive"
+                        .to_string()
+                )
+            );
+            // Return dummy values with fatal diagnostic
+            let iterator = EventIterator {
+                reader: Box::new(BufReader::new(std::io::empty())),
+                diagnostics,
+                header: Header::new(Value::Null, None),
+                filename: self.filename.clone(),
+                line_number: 1,
+            };
+            return Ok((Value::Null, iterator));
+        }
+
+        // Create appropriate reader based on compression format
+        #[cfg(feature = "compression")]
+        let reader: Box<dyn BufRead> = match compression_format {
+            CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))),
+            CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))),
+            CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))),
+            CompressionFormat::Brotli => Box::new(BufReader::new(Decompressor::new(file, 4096))),
+            CompressionFormat::Zstd => Box::new(BufReader::new(ZstdDecoder::new(file)?)),
+            CompressionFormat::None => Box::new(BufReader::new(file)),
+        };
+
+        #[cfg(not(feature = "compression"))]
+        let reader: Box<dyn BufRead> = Box::new(BufReader::new(file));
+
+        let mut reader = reader;
+        let mut header_line = String::new();
+        let _bytes_read = match reader.read_line(&mut header_line) {
+            Ok(0) => {
+                // Empty file
+                diagnostics.add(
+                    Diagnostic::new(
+                        DiagnosticLevel::Fatal,
+                        DiagnosticCode::EmptyFile,
+                        "I found an empty file, but I need at least a header line.".to_string(),
+                    )
+                    .with_location(self.filename.clone(), 1)
+                    .with_advice(
+                        "See the file format specification for header structure."
+                            .to_string(),
+                    ),
+                );
+                let iterator = EventIterator {
+                    reader,
+                    diagnostics,
+                    header: Header::new(Value::Null, None),
+                    filename: self.filename.clone(),
+                    line_number: 1,
+                };
+                return Ok((Value::Null, iterator));
+            }
+            Ok(n) => n,
+            Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
+                // UTF-8 error
+                diagnostics.add(
+                    Diagnostic::new(
+                        DiagnosticLevel::Fatal,
+                        DiagnosticCode::InvalidUtf8,
+                        "I found invalid UTF-8 bytes at line 1.".to_string()
+                    )
+                    .with_location(self.filename.clone(), 1)
+                    .with_advice(
+                        "The JSON Archive format requires UTF-8 encoding. Make sure the file \
+                         was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding."
+                            .to_string()
+                    )
+                );
+                let iterator = EventIterator {
+                    reader,
+                    diagnostics,
+                    header: Header::new(Value::Null, None),
+                    filename: self.filename.clone(),
+                    line_number: 1,
+                };
+                return Ok((Value::Null, iterator));
+            }
+            Err(e) => return Err(e),
+        };
+
+        let header = match self.parse_header(&header_line, 1, &mut diagnostics) {
+            Some(h) => h,
+            None => {
+                let iterator = EventIterator {
+                    reader,
+                    diagnostics,
+                    header: Header::new(Value::Null, None),
+                    filename: self.filename.clone(),
+                    line_number: 1,
+                };
+                return Ok((Value::Null, iterator));
+            }
+        };
+
+        let iterator = EventIterator {
+            reader,
+            diagnostics,
+            header: header.clone(),
+            filename: self.filename.clone(),
+            line_number: 1,
+        };
+
+        Ok((header.initial, iterator))
+    }
+
+    pub fn read<P: AsRef<Path>>(&self, path: P) -> std::io::Result<ReadResult> {
+        let (initial_value, mut event_iter) = self.events(&path)?;
+
+        // Check for early fatal diagnostics (like compression not supported)
+        if event_iter.diagnostics.has_fatal() {
+            return Ok(ReadResult {
+                header: Header::new(Value::Null, None),
+                final_state: Value::Null,
+                diagnostics: event_iter.diagnostics,
+                observation_count: 0,
+            });
+        }
+
+        let header = Header::new(initial_value.clone(), None);
+        let mut state = initial_value;
+        let mut seen_observations: HashSet<String> = HashSet::new();
+        let mut current_observation: Option<(String, usize, usize)> = None;
+        let mut events_in_observation = 0;
+        let mut observation_count = 0;
+
+        // Process events from iterator
+        while let Some(event) = event_iter.next() {
+            let line_number = event_iter.line_number;
+
             match event {
                 Event::Observe { observation_id, timestamp: _, change_count } => {
                     if let Some((_obs_id, obs_line, expected_count)) = &current_observation {
                         if events_in_observation != *expected_count {
-                            diagnostics.add(
+                            event_iter.diagnostics.add(
                                 Diagnostic::new(
                                     DiagnosticLevel::Warning,
                                     DiagnosticCode::ChangeCountMismatch,
@@ -233,7 +392,7 @@ impl ArchiveReader {
                     }
 
                     if seen_observations.contains(&observation_id) {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Warning,
                                 DiagnosticCode::DuplicateObservationId,
@@ -260,14 +419,13 @@ impl ArchiveReader {
                     if self.mode == ReadMode::FullValidation
                         && !seen_observations.contains(&observation_id)
                     {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Fatal,
                                 DiagnosticCode::NonExistentObservationId,
                                 format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id)
                             )
                             .with_location(self.filename.clone(), line_number)
-                            .with_snippet(format!("{} | {}", line_number, line))
                             .with_advice(
                                 "Each add/change/remove/move event must reference an observation ID from a preceding observe event."
                                     .to_string()
@@ -277,7 +435,7 @@ impl ArchiveReader {
                     }
 
                     if let Err(diag) = apply_add(&mut state, &path, value) {
-                        diagnostics.add(diag.with_location(self.filename.clone(), line_number));
+                        event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number));
                         continue;
                     }
                 }
@@ -288,7 +446,7 @@ impl ArchiveReader {
                     if self.mode == ReadMode::FullValidation
                         && !seen_observations.contains(&observation_id)
                     {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Fatal,
                                 DiagnosticCode::NonExistentObservationId,
@@ -300,7 +458,7 @@ impl ArchiveReader {
                     }
 
                     if let Err(diag) = apply_change(&mut state, &path, new_value) {
-                        diagnostics.add(diag.with_location(self.filename.clone(), line_number));
+                        event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number));
                        continue;
                     }
                 }
@@ -311,7 +469,7 @@ impl ArchiveReader {
                     if self.mode == ReadMode::FullValidation
                         && !seen_observations.contains(&observation_id)
                     {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Fatal,
                                 DiagnosticCode::NonExistentObservationId,
@@ -323,7 +481,7 @@ impl ArchiveReader {
                     }
 
                     if let Err(diag) = apply_remove(&mut state, &path) {
-                        diagnostics.add(diag.with_location(self.filename.clone(), line_number));
+                        event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number));
                         continue;
                     }
                 }
@@ -334,7 +492,7 @@ impl ArchiveReader {
                    if self.mode == ReadMode::FullValidation
                         && !seen_observations.contains(&observation_id)
                     {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Fatal,
                                 DiagnosticCode::NonExistentObservationId,
@@ -346,14 +504,14 @@ impl ArchiveReader {
                     }
 
                     if let Err(diag) = apply_move(&mut state, &path, moves) {
-                        diagnostics.add(diag.with_location(self.filename.clone(), line_number));
+                        event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number));
                        continue;
                     }
                 }
 
                 Event::Snapshot { observation_id: _, timestamp: _, object } => {
                     if self.mode == ReadMode::FullValidation && state != object {
-                        diagnostics.add(
+                        event_iter.diagnostics.add(
                             Diagnostic::new(
                                 DiagnosticLevel::Fatal,
                                 DiagnosticCode::SnapshotStateMismatch,
@@ -376,7 +534,7 @@ impl ArchiveReader {
 
         if let Some((_obs_id, obs_line, expected_count)) = &current_observation {
             if events_in_observation != *expected_count {
-                diagnostics.add(
+                event_iter.diagnostics.add(
                     Diagnostic::new(
                         DiagnosticLevel::Warning,
                         DiagnosticCode::ChangeCountMismatch,
@@ -393,10 +551,11 @@ impl ArchiveReader {
         Ok(ReadResult {
             header,
             final_state: state,
-            diagnostics,
+            diagnostics: event_iter.diagnostics,
            observation_count,
         })
     }
 
+
     fn parse_header(
         &self,
         line: &str,
@@ -470,7 +629,7 @@ impl ArchiveReader {
 }
 
-fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diagnostic> {
+pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diagnostic> {
     let pointer = JsonPointer::new(path).map_err(|diag| {
         diag.with_advice(
             "JSON Pointer paths must start with '/' and use '/' to separate segments.\n\
@@ -488,19 +647,19 @@ pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diagnostic> {
     })
 }
 
-fn apply_change(state: &mut Value, path: &str, new_value: Value) -> Result<(), Diagnostic> {
+pub fn apply_change(state: &mut Value, path: &str, new_value: Value) -> Result<(), Diagnostic> {
     let pointer = JsonPointer::new(path)?;
     pointer.set(state, new_value)?;
     Ok(())
 }
 
-fn apply_remove(state: &mut Value, path: &str) -> Result<(), Diagnostic> {
+pub fn apply_remove(state: &mut Value, path: &str) -> Result<(), Diagnostic> {
     let pointer = JsonPointer::new(path)?;
     pointer.remove(state)?;
     Ok(())
 }
 
-fn apply_move(
+pub fn apply_move(
     state: &mut Value,
     path: &str,
     moves: Vec<(usize, usize)>,
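
Because `detect_compression_format` is a pure function over the first four bytes plus the path, its contract is easy to pin down in a test. The sketch below is hypothetical (not part of this commit) and assumes the function and `CompressionFormat` stay module-private as declared above, so a same-module test can reach them:

```rust
#[cfg(test)]
mod compression_detection_tests {
    use super::*;
    use std::path::Path;

    #[test]
    fn sniffs_magic_bytes_and_extensions() {
        // Gzip: 0x1f 0x8b
        let gz = [0x1f, 0x8b, 0x08, 0x00];
        assert_eq!(detect_compression_format(Path::new("a.json.gz"), &gz), CompressionFormat::Gzip);

        // Zstd: 0x28 0xb5 0x2f 0xfd
        let zst = [0x28, 0xb5, 0x2f, 0xfd];
        assert_eq!(detect_compression_format(Path::new("a.json.zst"), &zst), CompressionFormat::Zstd);

        // Zlib: 0x78 followed by one of four flag bytes
        let zlib = [0x78, 0x9c, 0x00, 0x00];
        assert_eq!(detect_compression_format(Path::new("a.json"), &zlib), CompressionFormat::Zlib);

        // Brotli has no magic number, so detection falls back to the extension
        let opaque = [0x1b, 0x2e, 0x00, 0x00];
        assert_eq!(detect_compression_format(Path::new("a.json.br"), &opaque), CompressionFormat::Brotli);

        // Plain JSON matches no signature and is read uncompressed
        assert_eq!(detect_compression_format(Path::new("a.json"), b"{\"ty"), CompressionFormat::None);
    }
}
```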