From b65103c9f767fa72779cb56529790463ebff43d6 Mon Sep 17 00:00:00 2001 From: nobody Date: Mon, 1 Dec 2025 20:51:50 -0800 Subject: [PATCH] refactor: decompose archive read/write into composable building blocks Delete archive_context.rs and archive_ops.rs (1200+ lines of duplicated logic). Replace with four focused modules: 1. open_archive() - opens a file, detects compression, returns raw bytes 2. read_archive() - parses bytes into validated observations 3. CompressionWriter - writes bytes with any compression format 4. WriteStrategy - given a list of files, determines input archive, output archive, output format, and which of four write modes to use: - Create: new archive, no input - Append: uncompressed input, seek to end - AtomicSwap: compressed input, rewrite via temp file - CopyOnWrite: different input/output paths, transcode between formats Previously you could not specify output format. Appending always preserved the input format, creating compressed archives didn't work. Now all four cases work with any supported compression format. Atomic swap now writes to temp file, then renames. Crash-safe. Trade-off: This approach prioritizes code clarity over syscall efficiency. The archive file may be opened and read multiple times during a single operation (once for format detection, once for reading state, once for copying content). A more optimized implementation could reuse file handles, but the current approach makes each step's purpose obvious. --- Cargo.lock | 2 +- Cargo.toml | 3 +- fuzz/fuzz_targets/fuzz_mutations.rs | 17 +- fuzz/fuzz_targets/fuzz_random_bytes.rs | 9 +- fuzz/fuzz_targets/fuzz_structured.rs | 15 +- src/archive_context.rs | 595 ---------- src/archive_open.rs | 26 +- src/archive_ops.rs | 644 ----------- src/archive_reader.rs | 868 +++++++------- src/archive_writer.rs | 1009 ++++------------- src/atomic_file.rs | 20 +- src/bin/pointer_errors_demo.rs | 30 +- src/cmd/info.rs | 95 +- src/cmd/mod.rs | 1 + src/cmd/state.rs | 55 +- src/cmd/write.rs | 316 ++++++ src/compression_writer.rs | 431 +++++++ src/detection.rs | 34 +- src/diagnostics.rs | 6 + src/event_deserialize.rs | 75 +- src/events.rs | 1 - src/flags.rs | 2 +- src/lib.rs | 14 +- src/main.rs | 171 +-- src/pointer_errors.rs | 7 +- src/write_strategy.rs | 352 ++++++ tests/compressed_archive_tests.rs | 112 +- .../run_brotli_test.sh | 4 +- 28 files changed, 2055 insertions(+), 2859 deletions(-) delete mode 100644 src/archive_context.rs delete mode 100644 src/archive_ops.rs create mode 100644 src/cmd/write.rs create mode 100644 src/compression_writer.rs create mode 100644 src/write_strategy.rs diff --git a/Cargo.lock b/Cargo.lock index eacf05e..a8cdb8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -234,7 +234,7 @@ dependencies = [ [[package]] name = "json-archive" -version = "0.99.0" +version = "0.99.1" dependencies = [ "arbitrary", "brotli", diff --git a/Cargo.toml b/Cargo.toml index 6d12d96..3139ed2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "json-archive" -version = "0.99.0" +version = "0.99.1" edition = "2021" authors = ["Karl ", "nobody "] +homepage = "https://peoplesgrocers.com/code/oss/json-archive" repository = "https://peoplesgrocers.com/code/oss/json-archive" license = "AGPL-3.0" description = "CLI tool for tracking JSON file changes over time using delta-based archives" diff --git a/fuzz/fuzz_targets/fuzz_mutations.rs b/fuzz/fuzz_targets/fuzz_mutations.rs index a633906..bf2ad7d 100644 --- a/fuzz/fuzz_targets/fuzz_mutations.rs +++ b/fuzz/fuzz_targets/fuzz_mutations.rs @@ -1,8 +1,8 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; fn create_archive_content(data: &[u8]) -> Vec { @@ -80,24 +80,25 @@ fn create_archive_content(data: &[u8]) -> Vec { fuzz_target!(|data: &[u8]| { let archive_content = create_archive_content(data); - + if let Ok(mut temp_file) = NamedTempFile::new() { if temp_file.write_all(&archive_content).is_ok() { // Test both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { - let result = reader.read(temp_file.path()); - + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); + let result = read_archive(reader, &temp_file.path().display().to_string(), mode); + // Should never panic, regardless of input malformation match result { Ok(read_result) => { // Basic invariants that should hold for any successful parse let _ = &read_result.final_state; let _ = &read_result.diagnostics; - + // Observation count should be reasonable assert!(read_result.observation_count < 100000); - + // If we have diagnostics, they should be well-formed for diagnostic in read_result.diagnostics.diagnostics() { assert!(!diagnostic.description.is_empty()); diff --git a/fuzz/fuzz_targets/fuzz_random_bytes.rs b/fuzz/fuzz_targets/fuzz_random_bytes.rs index 0d0895a..75bd96b 100644 --- a/fuzz/fuzz_targets/fuzz_random_bytes.rs +++ b/fuzz/fuzz_targets/fuzz_random_bytes.rs @@ -1,8 +1,8 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; fuzz_target!(|data: &[u8]| { @@ -11,10 +11,11 @@ fuzz_target!(|data: &[u8]| { if temp_file.write_all(data).is_ok() { // Try to read the file with both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); // The read operation should never panic, regardless of input // It should either succeed or return an error gracefully - let _ = reader.read(temp_file.path()); + let _ = read_archive(reader, &temp_file.path().display().to_string(), mode); } } } diff --git a/fuzz/fuzz_targets/fuzz_structured.rs b/fuzz/fuzz_targets/fuzz_structured.rs index ba85a58..e787b5d 100644 --- a/fuzz/fuzz_targets/fuzz_structured.rs +++ b/fuzz/fuzz_targets/fuzz_structured.rs @@ -2,8 +2,8 @@ use libfuzzer_sys::fuzz_target; use arbitrary::{Arbitrary, Unstructured}; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; use serde_json::{json, Value}; @@ -160,20 +160,21 @@ fuzz_target!(|data: &[u8]| { let mut u = Unstructured::new(data); if let Ok(archive) = FuzzArchive::arbitrary(&mut u) { let content = archive.generate_archive(); - + if let Ok(mut temp_file) = NamedTempFile::new() { if temp_file.write_all(content.as_bytes()).is_ok() { // Test both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { - let result = reader.read(temp_file.path()); - + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); + let result = read_archive(reader, &temp_file.path().display().to_string(), mode); + // The operation should never panic // Verify that diagnostics are properly generated for invalid structures if let Ok(read_result) = result { // Basic sanity checks on the result assert!(read_result.observation_count < 10000); // Reasonable upper bound - + // If there are fatal diagnostics, final state should be reasonable if read_result.diagnostics.has_fatal() { // Should still have some state (at least initial or null) diff --git a/src/archive_context.rs b/src/archive_context.rs deleted file mode 100644 index 7faeefd..0000000 --- a/src/archive_context.rs +++ /dev/null @@ -1,595 +0,0 @@ -// json-archive is a tool for tracking JSON file changes over time -// Copyright (C) 2025 Peoples Grocers LLC -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published -// by the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . -// -// To purchase a license under different terms contact admin@peoplesgrocers.com -// To request changes, report bugs, or give user feedback contact -// marxism@peoplesgrocers.com -// - -//! Archive write context and shared observation writing logic. -//! -//! This module provides: -//! - `WriteContext`: A struct that holds the state needed to write observations -//! - `write_observations`: The shared logic for diffing JSON files and writing events -//! -//! The key insight is that both create and append operations share the same -//! core logic once they've set up their initial state and writer. - -use chrono::{DateTime, Utc}; -use serde_json::Value; -use std::io::Write; -use std::path::{Path, PathBuf}; -use uuid::Uuid; - -use crate::atomic_file::atomic_replace_file; -use crate::detection::CompressionFormat; -use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector}; -use crate::diff; -use crate::events::{Event, Observation}; - -/// Strategy for finishing the write operation. -#[derive(Debug, Clone)] -pub enum FinishStrategy { - /// Just flush the writer. Used for: - /// - Creating new archives - /// - Appending to uncompressed archives (same file) - FlushOnly, - - /// Atomic replace: swap temp file with original. Used for: - /// - Appending to compressed archives (rewrite strategy) - AtomicReplace { - temp_path: PathBuf, - output_path: PathBuf, - }, -} - -/// Context for writing observations to an archive. -/// -/// This struct is the result of the "setup phase" for both create and append -/// operations. Once you have a WriteContext, you can use `write_observations` -/// to add new states, then call `finish` to complete the operation. -pub struct WriteContext { - /// The writer to output JSON lines to. - pub writer: W, - - /// Current state of the archive (used for diffing). - pub current_state: Value, - - /// Number of observations already in the archive. - pub observation_count: usize, - - /// Optional interval for writing snapshots. - pub snapshot_interval: Option, - - /// How to finish the write operation. - pub finish_strategy: FinishStrategy, - - /// Diagnostics collected during setup (e.g., warnings from reading existing archive). - pub diagnostics: DiagnosticCollector, -} - -impl WriteContext { - /// Create a new write context. - pub fn new( - writer: W, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - ) -> Self { - Self { - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics: DiagnosticCollector::new(), - } - } - - /// Create a write context with existing diagnostics. - pub fn with_diagnostics( - writer: W, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - diagnostics: DiagnosticCollector, - ) -> Self { - Self { - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics, - } - } - - /// Write observations for a list of JSON files. - /// - /// For each file: - /// 1. Reads and parses the JSON - /// 2. Diffs against current state - /// 3. Writes observation events - /// 4. Optionally writes a snapshot if interval is reached - /// 5. Updates current state - /// - /// Returns the number of observations written. - pub fn write_observations>( - &mut self, - files: &[P], - ) -> Result> { - let mut observations_written = 0; - - for file_path in files.iter() { - let file_path = file_path.as_ref(); - - // Write comment marking which file we're processing - if let Err(e) = writeln!(self.writer, "# Processing file: {}", file_path.display()) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - - // Get file modification time for the observation timestamp - let file_mtime = get_file_mtime(file_path)?; - - // Read and parse new state - let content = std::fs::read_to_string(file_path).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the input file '{}': {}", file_path.display(), e), - )] - })?; - - let new_state: Value = serde_json::from_str(&content).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse '{}' as JSON: {}", file_path.display(), e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - // Generate diff and create observation - let observation_id = format!("obs-{}", Uuid::new_v4()); - let diff_events = diff::diff(&self.current_state, &new_state, "", &observation_id); - - // Skip if no changes - if diff_events.is_empty() { - continue; - } - - // Create and write observation - let mut observation = Observation::new(observation_id, file_mtime); - for event in diff_events { - observation.add_event(event); - } - - self.write_observation(observation)?; - observations_written += 1; - self.observation_count += 1; - - // Check if we should write a snapshot - if self.should_write_snapshot() { - self.write_snapshot(&new_state, file_mtime)?; - } - - // Update current state for next iteration - self.current_state = new_state; - } - - Ok(observations_written) - } - - /// Write a single observation's events to the output. - fn write_observation(&mut self, observation: Observation) -> Result<(), Vec> { - for event in observation.to_events() { - let event_json = serde_json::to_string(&event).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize an event to JSON: {}", e), - )] - })?; - - writeln!(self.writer, "{}", event_json).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - })?; - } - - Ok(()) - } - - /// Check if we should write a snapshot based on observation count. - fn should_write_snapshot(&self) -> bool { - if let Some(interval) = self.snapshot_interval { - self.observation_count > 0 && self.observation_count % interval == 0 - } else { - false - } - } - - /// Write a snapshot event. - fn write_snapshot(&mut self, state: &Value, timestamp: DateTime) -> Result<(), Vec> { - let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); - let snapshot = Event::Snapshot { - observation_id: snapshot_id, - timestamp, - object: state.clone(), - }; - - let snapshot_json = serde_json::to_string(&snapshot).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the snapshot to JSON: {}", e), - )] - })?; - - writeln!(self.writer, "{}", snapshot_json).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - })?; - - Ok(()) - } - - /// Finish the write operation. - /// - /// This flushes the writer and, for compressed append operations, - /// performs the atomic file replacement. - pub fn finish(mut self) -> Result> { - // Flush the writer - self.writer.flush().map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )] - })?; - - // Handle atomic replacement if needed - match self.finish_strategy { - FinishStrategy::FlushOnly => { - // Nothing more to do - } - FinishStrategy::AtomicReplace { temp_path, output_path } => { - atomic_replace_file(&output_path, &temp_path)?; - } - } - - Ok(self.diagnostics) - } -} - -/// Get the file modification time as a DateTime. -fn get_file_mtime>(path: P) -> Result, Vec> { - let path = path.as_ref(); - let metadata = std::fs::metadata(path).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get metadata for '{}': {}", path.display(), e), - )] - })?; - - let modified = metadata.modified().map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get modification time for '{}': {}", path.display(), e), - )] - })?; - - Ok(modified.into()) -} - -/// Encoder wrapper that provides a uniform interface for different compression formats. -/// -/// This enum wraps the various compression encoders so we can treat them uniformly -/// in the append-to-compressed-archive flow. -#[cfg(feature = "compression")] -pub enum CompressedWriter { - Gzip(flate2::write::GzEncoder), - Zlib(flate2::write::ZlibEncoder), - Zstd(zstd::stream::write::Encoder<'static, std::fs::File>), - Brotli(brotli::CompressorWriter), -} - -#[cfg(feature = "compression")] -impl Write for CompressedWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - match self { - CompressedWriter::Gzip(w) => w.write(buf), - CompressedWriter::Zlib(w) => w.write(buf), - CompressedWriter::Zstd(w) => w.write(buf), - CompressedWriter::Brotli(w) => w.write(buf), - } - } - - fn flush(&mut self) -> std::io::Result<()> { - match self { - CompressedWriter::Gzip(w) => w.flush(), - CompressedWriter::Zlib(w) => w.flush(), - CompressedWriter::Zstd(w) => w.flush(), - CompressedWriter::Brotli(w) => w.flush(), - } - } -} - -#[cfg(feature = "compression")] -impl CompressedWriter { - /// Create a new compressed writer for the given format and file. - pub fn new(format: CompressionFormat, file: std::fs::File) -> Result { - use flate2::Compression; - - match format { - CompressionFormat::Gzip => { - Ok(CompressedWriter::Gzip(flate2::write::GzEncoder::new(file, Compression::default()))) - } - CompressionFormat::Zlib => { - Ok(CompressedWriter::Zlib(flate2::write::ZlibEncoder::new(file, Compression::default()))) - } - CompressionFormat::Zstd => { - let encoder = zstd::stream::write::Encoder::new(file, 0).map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd encoder: {}", e), - ) - })?; - Ok(CompressedWriter::Zstd(encoder)) - } - CompressionFormat::Brotli => { - Ok(CompressedWriter::Brotli(brotli::CompressorWriter::new(file, 4096, 11, 22))) - } - CompressionFormat::Deflate => { - // Deflate is typically used within gzip/zlib, not standalone for files - Err(Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "Standalone deflate compression is not supported for writing.".to_string(), - )) - } - CompressionFormat::None => { - Err(Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "CompressedWriter::new called with CompressionFormat::None".to_string(), - )) - } - } - } - - /// Finish compression and return any errors. - /// - /// This must be called before the file is closed to ensure all - /// compressed data is flushed. - pub fn finish(self) -> Result<(), Diagnostic> { - match self { - CompressedWriter::Gzip(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish gzip compression: {}", e), - ) - })?; - } - CompressedWriter::Zlib(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish zlib compression: {}", e), - ) - })?; - } - CompressedWriter::Zstd(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish zstd compression: {}", e), - ) - })?; - } - CompressedWriter::Brotli(mut w) => { - // Brotli doesn't have a finish() method, flush is sufficient - w.flush().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush brotli compression: {}", e), - ) - })?; - } - } - Ok(()) - } -} - -/// A write context specifically for compressed output. -/// -/// This wraps WriteContext to handle the finish() call properly for -/// compressed writers, which need to call finish() on the encoder -/// before the atomic file swap. -#[cfg(feature = "compression")] -pub struct CompressedWriteContext { - /// The inner write context. - inner: WriteContext, -} - -#[cfg(feature = "compression")] -impl CompressedWriteContext { - /// Create a new compressed write context. - pub fn new( - writer: CompressedWriter, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - diagnostics: DiagnosticCollector, - ) -> Self { - Self { - inner: WriteContext::with_diagnostics( - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics, - ), - } - } - - /// Write observations for a list of JSON files. - pub fn write_observations>( - &mut self, - files: &[P], - ) -> Result> { - self.inner.write_observations(files) - } - - /// Write raw bytes to the output (used for copying existing archive content). - pub fn write_raw(&mut self, bytes: &[u8]) -> Result<(), Vec> { - self.inner.writer.write_all(bytes).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - }) - } - - /// Finish the write operation. - /// - /// This finishes the compression encoder, then performs any atomic - /// file operations needed. - pub fn finish(self) -> Result> { - let finish_strategy = self.inner.finish_strategy.clone(); - let diagnostics = self.inner.diagnostics; - - // Finish compression first - self.inner.writer.finish().map_err(|d| vec![d])?; - - // Then handle atomic replacement if needed - match finish_strategy { - FinishStrategy::FlushOnly => { - // Nothing more to do - } - FinishStrategy::AtomicReplace { temp_path, output_path } => { - atomic_replace_file(&output_path, &temp_path)?; - } - } - - Ok(diagnostics) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_write_context_single_observation() { - let mut output = Vec::new(); - let initial_state = json!({"count": 0}); - - { - let mut ctx = WriteContext::new( - &mut output, - initial_state, - 0, - None, - FinishStrategy::FlushOnly, - ); - - // Create a temp file with new state - let mut temp_file = tempfile::NamedTempFile::new().unwrap(); - std::io::Write::write_all(&mut temp_file, br#"{"count": 1}"#).unwrap(); - temp_file.flush().unwrap(); - - let count = ctx.write_observations(&[temp_file.path()]).unwrap(); - assert_eq!(count, 1); - } - - let output_str = String::from_utf8(output).unwrap(); - assert!(output_str.contains("# Processing file:")); - assert!(output_str.contains("observe")); - assert!(output_str.contains("change")); - assert!(output_str.contains("/count")); - } - - #[test] - fn test_write_context_no_changes() { - let mut output = Vec::new(); - let initial_state = json!({"count": 0}); - - { - let mut ctx = WriteContext::new( - &mut output, - initial_state, - 0, - None, - FinishStrategy::FlushOnly, - ); - - // Create a temp file with same state - let mut temp_file = tempfile::NamedTempFile::new().unwrap(); - std::io::Write::write_all(&mut temp_file, br#"{"count": 0}"#).unwrap(); - temp_file.flush().unwrap(); - - let count = ctx.write_observations(&[temp_file.path()]).unwrap(); - assert_eq!(count, 0); - } - - let output_str = String::from_utf8(output).unwrap(); - // Should have comment but no events - assert!(output_str.contains("# Processing file:")); - assert!(!output_str.contains("observe")); - } - - #[test] - fn test_should_write_snapshot() { - let output: Vec = Vec::new(); - - // No interval set - let ctx: WriteContext> = WriteContext::new( - output.clone(), - json!({}), - 5, - None, - FinishStrategy::FlushOnly, - ); - assert!(!ctx.should_write_snapshot()); - - // Interval of 2, at observation 4 (multiple of 2) - let ctx: WriteContext> = WriteContext::new( - output.clone(), - json!({}), - 4, - Some(2), - FinishStrategy::FlushOnly, - ); - assert!(ctx.should_write_snapshot()); - - // Interval of 2, at observation 3 (not multiple of 2) - let ctx: WriteContext> = WriteContext::new( - output, - json!({}), - 3, - Some(2), - FinishStrategy::FlushOnly, - ); - assert!(!ctx.should_write_snapshot()); - } -} diff --git a/src/archive_open.rs b/src/archive_open.rs index 68f3ea5..4683969 100644 --- a/src/archive_open.rs +++ b/src/archive_open.rs @@ -135,10 +135,17 @@ pub fn open_archive>(path: P) -> Result Result<(), Diagnostic> { #[cfg(not(feature = "compression"))] if format != CompressionFormat::None { @@ -154,11 +161,12 @@ pub fn check_compression_support( return Err(Diagnostic::fatal( DiagnosticCode::UnsupportedVersion, format!( - "I detected a {}-compressed archive, but this build doesn't support compression.", - format_name + "I inferred that you wanted to {} a {}-compressed archive at:\n\n {}\n\n\ + However, this build does not include compression libraries.", + action, format_name, filename.display() ), ) - .with_location(filename.to_string(), 1) + .with_location(filename.display().to_string(), 1) .with_advice( "This binary was built without compression support to reduce binary size and dependencies.\n\ You have two options:\n\ @@ -175,7 +183,9 @@ pub fn check_compression_support( /// /// This opens the file, reads magic bytes, and returns the compression format. /// Useful when you need to know the format before deciding how to process the file. -pub fn detect_archive_compression>(path: P) -> Result { +pub fn detect_archive_compression>( + path: P, +) -> Result { let path = path.as_ref(); let filename = path.display().to_string(); @@ -208,7 +218,11 @@ mod tests { #[test] fn test_open_uncompressed_archive() { let mut temp_file = NamedTempFile::new().unwrap(); - writeln!(temp_file, r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#).unwrap(); + writeln!( + temp_file, + r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"# + ) + .unwrap(); temp_file.flush().unwrap(); let opened = open_archive(temp_file.path()).unwrap(); diff --git a/src/archive_ops.rs b/src/archive_ops.rs deleted file mode 100644 index c03521a..0000000 --- a/src/archive_ops.rs +++ /dev/null @@ -1,644 +0,0 @@ -// json-archive is a tool for tracking JSON file changes over time -// Copyright (C) 2025 Peoples Grocers LLC -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published -// by the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . -// -// To purchase a license under different terms contact admin@peoplesgrocers.com -// To request changes, report bugs, or give user feedback contact -// marxism@peoplesgrocers.com -// - -//! High-level archive operations: create and append. -//! -//! This module provides the top-level entry points for creating and appending -//! to archives. These functions handle all the setup (opening files, detecting -//! compression, reading existing state) and then delegate to the shared -//! `WriteContext` for the actual observation writing. -//! -//! ## Architecture -//! -//! ```text -//! ┌─────────────────┐ -//! │ archive_ops.rs │ -//! │ (this module) │ -//! └────────┬────────┘ -//! │ -//! ┌─────────────────┼─────────────────┐ -//! │ │ │ -//! ▼ ▼ ▼ -//! ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ -//! │ archive_open │ │archive_context│ │ archive_reader│ -//! │ (compression) │ │ (WriteContext)│ │ (parsing) │ -//! └───────────────┘ └───────────────┘ └───────────────┘ -//! ``` -//! -//! ## Operations -//! -//! - `create_archive`: Create a new archive from one or more JSON files -//! - `append_to_archive`: Add observations to an existing archive - -use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Read, Write}; -use std::path::{Path, PathBuf}; - -use serde_json::Value; - -use crate::archive_context::{FinishStrategy, WriteContext}; -use crate::archive_open::{check_compression_support, detect_archive_compression, open_archive}; -use crate::archive_reader::{ArchiveReader, ReadMode}; -use crate::atomic_file::generate_temp_filename; -use crate::detection::CompressionFormat; -use crate::diagnostics::{Diagnostic, DiagnosticCode}; -use crate::events::Header; - -#[cfg(feature = "compression")] -use crate::archive_context::{CompressedWriteContext, CompressedWriter}; - -/// Create a new archive from a list of JSON files. -/// -/// The first file becomes the initial state in the header. Each subsequent -/// file generates an observation with the diff from the previous state. -/// -/// # Arguments -/// -/// * `input_files` - List of JSON files to process (at least one required) -/// * `output_path` - Path for the new archive file -/// * `source` - Optional source identifier for the header -/// * `snapshot_interval` - Optional interval for writing snapshots -/// -/// # Returns -/// -/// Returns an empty Vec on success, or a Vec of diagnostics on error. -pub fn create_archive>( - input_files: &[P], - output_path: P, - source: Option, - snapshot_interval: Option, -) -> Vec { - if input_files.is_empty() { - return vec![Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - "I need at least one input file to create an archive.".to_string(), - )]; - } - - // Read and parse the first file to get initial state - let first_path = input_files[0].as_ref(); - let first_content = match std::fs::read_to_string(first_path) { - Ok(content) => content, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the first input file '{}': {}", first_path.display(), e), - )]; - } - }; - - let initial_state: Value = match serde_json::from_str(&first_content) { - Ok(state) => state, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse '{}' as JSON: {}", first_path.display(), e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())]; - } - }; - - // Create the output file - let output_path = output_path.as_ref(); - let file = match File::create(output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create the output file '{}': {}", output_path.display(), e), - ) - .with_advice( - "Make sure you have write permission in this directory and that the path is valid." - .to_string(), - )]; - } - }; - - let mut writer = BufWriter::new(file); - - // Write the header - let header = Header::new(initial_state.clone(), source); - let header_json = match serde_json::to_string(&header) { - Ok(json) => json, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the header to JSON: {}", e), - )]; - } - }; - - if let Err(e) = writeln!(writer, "{}", header_json) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output file: {}", e), - )]; - } - - // If there are more files, process them through WriteContext - if input_files.len() > 1 { - let mut ctx = WriteContext::new( - writer, - initial_state, - 0, - snapshot_interval, - FinishStrategy::FlushOnly, - ); - - // Process remaining files (skip the first one which is now the initial state) - let remaining_files: Vec<&Path> = input_files[1..].iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&remaining_files) { - return diagnostics; - } - - if let Err(diagnostics) = ctx.finish() { - return diagnostics; - } - } else { - // Just flush the header - if let Err(e) = writer.flush() { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )]; - } - } - - Vec::new() -} - -/// Append observations to an existing archive. -/// -/// This function handles both compressed and uncompressed archives: -/// - Uncompressed: Opens in append mode and writes new observations directly -/// - Compressed: Reads entire archive, writes to temp file, atomic swap -/// -/// # Arguments -/// -/// * `archive_path` - Path to the existing archive -/// * `new_files` - List of JSON files to add as observations -/// * `output_path` - Where to write the result (can be same as archive_path) -/// * `source` - Optional source identifier (not currently used for append) -/// * `snapshot_interval` - Optional interval for writing snapshots -/// -/// # Returns -/// -/// Returns an empty Vec on success, or a Vec of diagnostics on error. -pub fn append_to_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - _source: Option, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Detect compression format - let format = match detect_archive_compression(archive_path) { - Ok(f) => f, - Err(diag) => return vec![diag], - }; - - // Check if this build supports the detected compression - if let Err(diag) = check_compression_support(format, &archive_path.display().to_string()) { - return vec![diag]; - } - - if format == CompressionFormat::None { - append_to_uncompressed_archive(archive_path, new_files, output_path, snapshot_interval) - } else { - append_to_compressed_archive(archive_path, new_files, output_path, format, snapshot_interval) - } -} - -/// Append to an uncompressed archive. -/// -/// This reads the archive to get the final state, then opens the file -/// in append mode to add new observations. -fn append_to_uncompressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Read the existing archive to get final state - let reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive for reading: {}", e), - )]; - } - }; - - let read_result = match reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics in the archive - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive." - .to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // If output path is different from archive path, copy the archive first - if archive_path != output_path { - if let Err(e) = std::fs::copy(archive_path, output_path) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't copy the archive to the output location: {}", e), - )]; - } - } - - // Open file in append mode - let file = match OpenOptions::new().append(true).open(output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e), - ) - .with_advice( - "Make sure the archive file exists and you have write permission.".to_string(), - )]; - } - }; - - // Create write context and process files - let mut ctx = WriteContext::with_diagnostics( - file, - read_result.final_state, - read_result.observation_count, - snapshot_interval, - FinishStrategy::FlushOnly, - read_result.diagnostics, - ); - - let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&file_refs) { - return diagnostics; - } - - match ctx.finish() { - Ok(collector) => collector.into_diagnostics(), - Err(diagnostics) => diagnostics, - } -} - -/// Append to a compressed archive. -/// -/// This reads the entire archive (decompressing), writes everything to a -/// new compressed temp file with the new observations, then atomically -/// swaps the temp file with the original. -#[cfg(feature = "compression")] -fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - format: CompressionFormat, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Step 1: Open and decompress the archive, reading all bytes - let opened = match open_archive(archive_path) { - Ok(o) => o, - Err(diag) => return vec![diag], - }; - - // Read all decompressed bytes into memory - let mut decompressed_bytes = Vec::new(); - let mut reader = opened.reader; - if let Err(e) = reader.read_to_end(&mut decompressed_bytes) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - - // Step 2: Parse the archive to get final state using AppendSeek mode - // We need to re-read from the decompressed bytes - let archive_reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create archive reader: {}", e), - )]; - } - }; - - let read_result = match archive_reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't parse the archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive." - .to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // Step 3: Create temp file with same compression format - let temp_path = generate_temp_filename(output_path); - let temp_file = match File::create(&temp_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create temp file: {}", e), - )]; - } - }; - - // Create compressed writer - let compressed_writer = match CompressedWriter::new(format, temp_file) { - Ok(w) => w, - Err(diag) => { - let _ = std::fs::remove_file(&temp_path); - return vec![diag]; - } - }; - - // Step 4: Create write context and copy old data + write new observations - let mut ctx = CompressedWriteContext::new( - compressed_writer, - read_result.final_state, - read_result.observation_count, - snapshot_interval, - FinishStrategy::AtomicReplace { - temp_path: temp_path.clone(), - output_path: output_path.to_path_buf(), - }, - read_result.diagnostics, - ); - - // Write all old decompressed bytes first - if let Err(diagnostics) = ctx.write_raw(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - - // Write new observations - let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&file_refs) { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - - // Finish (this handles compression finalization and atomic swap) - match ctx.finish() { - Ok(collector) => collector.into_diagnostics(), - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - diagnostics - } - } -} - -/// Stub for when compression feature is not enabled. -#[cfg(not(feature = "compression"))] -fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - _new_files: &[Q], - _output_path: P, - format: CompressionFormat, - _snapshot_interval: Option, -) -> Vec { - let format_name = match format { - CompressionFormat::Gzip => "gzip", - CompressionFormat::Deflate => "deflate", - CompressionFormat::Zlib => "zlib", - CompressionFormat::Brotli => "brotli", - CompressionFormat::Zstd => "zstd", - CompressionFormat::None => unreachable!(), - }; - - vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!( - "I detected a {}-compressed archive, but this build doesn't support compression.", - format_name - ), - ) - .with_location(archive_path.as_ref().display().to_string(), 1) - .with_advice( - "This binary was built without compression support.\n\ - Install with compression: cargo install json-archive --features compression\n\ - Or decompress the file first." - .to_string(), - )] -} - -/// Generate default output filename from input filename. -/// -/// - `test.json` -> `test.json.archive` -/// - `test.txt` -> `test.txt.json.archive` -/// - `test` -> `test.json.archive` -/// - `test.json.archive` -> `test.json.archive` (unchanged) -pub fn default_output_filename>(input_path: P) -> PathBuf { - let path = input_path.as_ref(); - let mut output = path.to_path_buf(); - - // If it already ends with .json.archive, don't modify it - if let Some(filename) = path.file_name() { - if let Some(filename_str) = filename.to_str() { - if filename_str.ends_with(".json.archive") { - return output; - } - } - } - - // Add .json.archive extension - if let Some(extension) = path.extension() { - if extension == "json" { - // Replace .json with .json.archive - output.set_extension("json.archive"); - } else { - // Append .json.archive to whatever extension exists - let new_extension = format!("{}.json.archive", extension.to_string_lossy()); - output.set_extension(new_extension); - } - } else { - // No extension, just add .json.archive - output.set_extension("json.archive"); - } - - output -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - use std::io::Write as IoWrite; - use tempfile::NamedTempFile; - - #[test] - fn test_create_archive_single_file() -> Result<(), Box> { - // Create input file - let mut input_file = NamedTempFile::new()?; - writeln!(input_file, r#"{{"count": 0, "name": "test"}}"#)?; - input_file.flush()?; - - // Create output file - let output_file = NamedTempFile::new()?; - - let diagnostics = create_archive( - &[input_file.path()], - output_file.path(), - Some("test-source".to_string()), - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics); - - // Verify the output - let content = std::fs::read_to_string(output_file.path())?; - let header: Header = serde_json::from_str(content.lines().next().unwrap())?; - assert_eq!(header.file_type, "@peoplesgrocers/json-archive"); - assert_eq!(header.version, 1); - assert_eq!(header.initial, json!({"count": 0, "name": "test"})); - - Ok(()) - } - - #[test] - fn test_create_archive_multiple_files() -> Result<(), Box> { - // Create input files - let mut file1 = NamedTempFile::new()?; - let mut file2 = NamedTempFile::new()?; - writeln!(file1, r#"{{"count": 0}}"#)?; - writeln!(file2, r#"{{"count": 1}}"#)?; - file1.flush()?; - file2.flush()?; - - let output_file = NamedTempFile::new()?; - - let diagnostics = create_archive( - &[file1.path(), file2.path()], - output_file.path(), - None, - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics); - - // Verify output has header + observation events - let content = std::fs::read_to_string(output_file.path())?; - let lines: Vec<&str> = content.lines().collect(); - assert!(lines.len() >= 3); // header + comment + observe + change - - // First line should be header - let header: Header = serde_json::from_str(lines[0])?; - assert_eq!(header.initial, json!({"count": 0})); - - // Should contain observe and change events - assert!(content.contains("observe")); - assert!(content.contains("change")); - assert!(content.contains("/count")); - - Ok(()) - } - - #[test] - fn test_append_to_uncompressed_archive() -> Result<(), Box> { - // Create initial archive - let mut archive_file = NamedTempFile::new()?; - let header = Header::new(json!({"count": 0}), None); - writeln!(archive_file, "{}", serde_json::to_string(&header)?)?; - archive_file.flush()?; - - // Create file to append - let mut new_file = NamedTempFile::new()?; - writeln!(new_file, r#"{{"count": 1}}"#)?; - new_file.flush()?; - - let diagnostics = append_to_archive( - archive_file.path(), - &[new_file.path()], - archive_file.path(), - None, - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics); - - // Verify the archive was updated - let content = std::fs::read_to_string(archive_file.path())?; - assert!(content.contains("observe")); - assert!(content.contains("change")); - assert!(content.contains("/count")); - - Ok(()) - } - - #[test] - fn test_default_output_filename() { - assert_eq!( - default_output_filename("test.json"), - PathBuf::from("test.json.archive") - ); - - assert_eq!( - default_output_filename("test.txt"), - PathBuf::from("test.txt.json.archive") - ); - - assert_eq!( - default_output_filename("test"), - PathBuf::from("test.json.archive") - ); - - assert_eq!( - default_output_filename("test.json.archive"), - PathBuf::from("test.json.archive") - ); - } -} diff --git a/src/archive_reader.rs b/src/archive_reader.rs index 7ea419f..d434b59 100644 --- a/src/archive_reader.rs +++ b/src/archive_reader.rs @@ -21,22 +21,12 @@ use serde_json::Value; use std::collections::HashSet; -use std::fs::File; -use std::io::{BufRead, BufReader, Read}; -use std::path::Path; +use std::io::BufRead; use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel}; use crate::event_deserialize::EventDeserializer; use crate::events::{Event, Header}; use crate::pointer::JsonPointer; -use crate::detection::{CompressionFormat, detect_compression_format}; - -#[cfg(feature = "compression")] -use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder}; -#[cfg(feature = "compression")] -use brotli::Decompressor; -#[cfg(feature = "compression")] -use zstd::stream::read::Decoder as ZstdDecoder; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ReadMode { @@ -44,11 +34,6 @@ pub enum ReadMode { AppendSeek, } -pub struct ArchiveReader { - mode: ReadMode, - filename: String, -} - #[derive(Debug)] pub struct ReadResult { pub header: Header, @@ -57,15 +42,15 @@ pub struct ReadResult { pub observation_count: usize, } -pub struct EventIterator { - reader: Box, +pub struct EventIterator { + reader: R, pub diagnostics: DiagnosticCollector, pub header: Header, filename: String, line_number: usize, } -impl Iterator for EventIterator { +impl Iterator for EventIterator { type Item = Event; fn next(&mut self) -> Option { @@ -86,7 +71,8 @@ impl Iterator for EventIterator { } // Try to parse as event - let event_deserializer = match serde_json::from_str::(&line) { + let event_deserializer = match serde_json::from_str::(&line) + { Ok(d) => d, Err(e) => { self.diagnostics.add( @@ -113,7 +99,7 @@ impl Iterator for EventIterator { self.diagnostics.add( diagnostic .with_location(self.filename.clone(), self.line_number) - .with_snippet(format!("{} | {}", self.line_number, line.trim())) + .with_snippet(format!("{} | {}", self.line_number, line.trim())), ); } @@ -146,431 +132,355 @@ impl Iterator for EventIterator { } } -impl ArchiveReader { - pub fn new>(path: P, mode: ReadMode) -> std::io::Result { - let filename = path.as_ref().display().to_string(); - Ok(Self { mode, filename }) - } +/// Parse header and create event iterator from any BufRead source. +pub fn read_events( + mut reader: R, + filename: &str, +) -> Result<(Value, EventIterator), Diagnostic> { + let mut header_line = String::new(); + let mut line_number = 0; - pub fn events>(&self, path: P) -> std::io::Result<(Value, EventIterator)> { - let path = path.as_ref(); - let mut file = File::open(path)?; + // Skip comment lines until we find the header + loop { + header_line.clear(); + line_number += 1; - // Detect compression format - let mut magic_bytes = [0u8; 4]; - let bytes_read = file.read(&mut magic_bytes)?; - let compression_format = detect_compression_format(path, &magic_bytes[..bytes_read]); - - // Re-open file to reset position - file = File::open(path)?; - - let mut diagnostics = DiagnosticCollector::new(); - - // Check if compression is detected but not supported - #[cfg(not(feature = "compression"))] - if compression_format != CompressionFormat::None { - let format_name = match compression_format { - CompressionFormat::Gzip => "gzip", - CompressionFormat::Deflate => "deflate", - CompressionFormat::Zlib => "zlib", - CompressionFormat::Brotli => "brotli", - CompressionFormat::Zstd => "zstd", - CompressionFormat::None => unreachable!(), - }; - - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("I detected a {}-compressed archive, but this build doesn't support compression.", format_name) - ) - .with_location(self.filename.clone(), 1) - .with_advice( - "This binary was built without compression support to reduce binary size and dependencies.\n\ - You have two options:\n\ - 1. Install the version with compression support: cargo install json-archive --features compression\n\ - 2. Manually decompress the file first, then use this tool on the uncompressed archive" - .to_string() - ) - ); - - // Return dummy values with fatal diagnostic - let iterator = EventIterator { - reader: Box::new(BufReader::new(std::io::empty())), - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); - } - - // Create appropriate reader based on compression format - #[cfg(feature = "compression")] - let reader: Box = match compression_format { - CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))), - CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))), - CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))), - CompressionFormat::Brotli => Box::new(BufReader::new(Decompressor::new(file, 4096))), - CompressionFormat::Zstd => Box::new(BufReader::new(ZstdDecoder::new(file)?)), - CompressionFormat::None => Box::new(BufReader::new(file)), - }; - - #[cfg(not(feature = "compression"))] - let reader: Box = Box::new(BufReader::new(file)); - - let mut reader = reader; - let mut header_line = String::new(); - - let _bytes_read = match reader.read_line(&mut header_line) { + match reader.read_line(&mut header_line) { Ok(0) => { - // Empty file - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::EmptyFile, - "I found an empty file, but I need at least a header line.".to_string(), - ) - .with_location(self.filename.clone(), 1) - .with_advice( - "See the file format specification for header structure." - .to_string(), - ), - ); - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); + // Empty file or only comments + return Err(Diagnostic::fatal( + DiagnosticCode::EmptyFile, + "I found an empty file (or only comments), but I need at least a header line." + .to_string(), + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "See the file format specification for header structure.".to_string(), + )); } - Ok(n) => n, + Ok(_) => {} Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { // UTF-8 error - diagnostics.add( + return Err( Diagnostic::fatal( DiagnosticCode::InvalidUtf8, - "I found invalid UTF-8 bytes at line 1.".to_string() + format!("I found invalid UTF-8 bytes at line {}.", line_number) ) - .with_location(self.filename.clone(), 1) + .with_location(filename.to_string(), line_number) .with_advice( "The JSON Archive format requires UTF-8 encoding. Make sure the file \ was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding." .to_string() ) ); - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); } - Err(e) => return Err(e), - }; - - let header = match self.parse_header(&header_line, 1, &mut diagnostics) { - Some(h) => h, - None => { - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't read from the archive: {}", e), + ) + .with_location(filename.to_string(), line_number)); } }; - let iterator = EventIterator { - reader, - diagnostics, - header: header.clone(), - filename: self.filename.clone(), - line_number: 1, - }; - - Ok((header.initial, iterator)) + // Skip comment lines (lines starting with #) + let trimmed = header_line.trim_start(); + if !trimmed.starts_with('#') { + break; + } } - pub fn read>(&self, path: P) -> std::io::Result { - let (initial_value, mut event_iter) = self.events(&path)?; + let header = parse_header(filename, &header_line, line_number)?; - // Check for early fatal diagnostics (like compression not supported) - if event_iter.diagnostics.has_fatal() { - return Ok(ReadResult { - header: Header::new(Value::Null, None), - final_state: Value::Null, - diagnostics: event_iter.diagnostics, - observation_count: 0, - }); - } + let iterator = EventIterator { + reader, + diagnostics: DiagnosticCollector::new(), + header: header.clone(), + filename: filename.to_string(), + line_number, + }; - let header = Header::new(initial_value.clone(), None); - let mut state = initial_value; - let mut seen_observations: HashSet = HashSet::new(); - let mut current_observation: Option<(String, usize, usize)> = None; - let mut events_in_observation = 0; - let mut observation_count = 0; + Ok((header.initial, iterator)) +} - // Process events from iterator - while let Some(event) = event_iter.next() { - let line_number = event_iter.line_number; +/// Read all events and return final state. +pub fn read_archive( + reader: R, + filename: &str, + mode: ReadMode, +) -> Result { + let (initial_value, mut event_iter) = read_events(reader, filename)?; - match event { - Event::Observe { observation_id, timestamp: _, change_count } => { - if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { - if events_in_observation != *expected_count { - event_iter.diagnostics.add( - Diagnostic::new( - DiagnosticLevel::Warning, - DiagnosticCode::ChangeCountMismatch, - format!( - "The observe event at line {} declared {} changes, but I found {}.", - obs_line, expected_count, events_in_observation - ) - ) - .with_location(self.filename.clone(), *obs_line) - .with_advice( - "Make sure the change_count in the observe event matches the number of \ - add/change/remove/move events that follow it." - .to_string() - ) - ); - } - } + let header = Header::new(initial_value.clone(), None); + let mut state = initial_value; + let mut seen_observations: HashSet = HashSet::new(); + let mut current_observation: Option<(String, usize, usize)> = None; + let mut events_in_observation = 0; + let mut observation_count = 0; - if seen_observations.contains(&observation_id) { + // Process events from iterator + while let Some(event) = event_iter.next() { + let line_number = event_iter.line_number; + + match event { + Event::Observe { + observation_id, + timestamp: _, + change_count, + } => { + if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { + if events_in_observation != *expected_count { event_iter.diagnostics.add( Diagnostic::new( DiagnosticLevel::Warning, - DiagnosticCode::DuplicateObservationId, - format!("I found a duplicate observation ID: '{}'", observation_id), + DiagnosticCode::ChangeCountMismatch, + format!( + "The observe event at line {} declared {} changes, but I found {}.", + obs_line, expected_count, events_in_observation + ) ) - .with_location(self.filename.clone(), line_number) + .with_location(filename.to_string(), *obs_line) .with_advice( - "Each observation ID should be unique within the archive. \ - Consider using UUIDs or timestamps to ensure uniqueness." - .to_string(), - ), - ); - } - - seen_observations.insert(observation_id.clone()); - current_observation = Some((observation_id, line_number, change_count)); - events_in_observation = 0; - observation_count += 1; - } - - Event::Add { path, value, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - .with_advice( - "Each add/change/remove/move event must reference an observation ID from a preceding observe event." - .to_string() - ) - ); - continue; - } - - if let Err(diag) = apply_add(&mut state, &path, value) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Change { path, new_value, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_change(&mut state, &path, new_value) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Remove { path, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_remove(&mut state, &path) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Move { path, moves, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_move(&mut state, &path, moves) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Snapshot { observation_id: _, timestamp: _, object } => { - if self.mode == ReadMode::FullValidation && state != object { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::SnapshotStateMismatch, - "I found a snapshot whose state doesn't match the replayed state up to this point.".to_string() - ) - .with_location(self.filename.clone(), line_number) - .with_advice( - "This could indicate corruption or that events were applied incorrectly. \ - The snapshot state should exactly match the result of replaying all events \ - from the initial state." + "Make sure the change_count in the observe event matches the number of \ + add/change/remove/move events that follow it." .to_string() ) ); } - - state = object; } - } - } - if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { - if events_in_observation != *expected_count { - event_iter.diagnostics.add( - Diagnostic::new( - DiagnosticLevel::Warning, - DiagnosticCode::ChangeCountMismatch, - format!( - "The observe event at line {} declared {} changes, but I found {}.", - obs_line, expected_count, events_in_observation - ), - ) - .with_location(self.filename.clone(), *obs_line), - ); - } - } - - Ok(ReadResult { - header, - final_state: state, - diagnostics: event_iter.diagnostics, - observation_count, - }) - } - - fn parse_header( - &self, - line: &str, - line_number: usize, - diagnostics: &mut DiagnosticCollector, - ) -> Option
{ - let value: Value = match serde_json::from_str(line) { - Ok(v) => v, - Err(e) => { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::MissingHeader, - format!("I couldn't parse the header as JSON: {}", e), - ) - .with_location(self.filename.clone(), line_number) - .with_snippet(format!("{} | {}", line_number, line)) - .with_advice( - "The first line must be a JSON object containing the archive header.\n\ - Required fields: type, version, created, initial" - .to_string(), - ), - ); - return None; - } - }; - - match serde_json::from_value::
(value.clone()) { - Ok(header) => { - if header.version != 1 { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("I found version {}, but I only support version 1.", header.version) + if seen_observations.contains(&observation_id) { + event_iter.diagnostics.add( + Diagnostic::new( + DiagnosticLevel::Warning, + DiagnosticCode::DuplicateObservationId, + format!("I found a duplicate observation ID: '{}'", observation_id), ) - .with_location(self.filename.clone(), line_number) + .with_location(filename.to_string(), line_number) .with_advice( - "This archive was created with a newer or older version of the format. \ - You may need to upgrade your tools or convert the archive." + "Each observation ID should be unique within the archive. \ + Consider using UUIDs or timestamps to ensure uniqueness." + .to_string(), + ), + ); + } + + seen_observations.insert(observation_id.clone()); + current_observation = Some((observation_id, line_number, change_count)); + events_in_observation = 0; + observation_count += 1; + } + + Event::Add { + path, + value, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "Each add/change/remove/move event must reference an observation ID from a preceding observe event." .to_string() ) ); - return None; + continue; } - Some(header) + if let Err(diag) = apply_add(&mut state, &path, value) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } } - Err(e) => { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - format!("I couldn't parse the header: {}", e), - ) - .with_location(self.filename.clone(), line_number) - .with_snippet(format!("{} | {}", line_number, line)) - .with_advice( - "The header must contain:\n\ - - type: \"@peoplesgrocers/json-archive\"\n\ - - version: 1\n\ - - created: an ISO-8601 timestamp\n\ - - initial: the initial state object" - .to_string(), - ), - ); - None + + Event::Change { + path, + new_value, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_change(&mut state, &path, new_value) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Remove { + path, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_remove(&mut state, &path) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Move { + path, + moves, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_move(&mut state, &path, moves) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Snapshot { + observation_id: _, + timestamp: _, + object, + } => { + if mode == ReadMode::FullValidation && state != object { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::SnapshotStateMismatch, + "I found a snapshot whose state doesn't match the replayed state up to this point.".to_string() + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "This could indicate corruption or that events were applied incorrectly. \ + The snapshot state should exactly match the result of replaying all events \ + from the initial state." + .to_string() + ) + ); + } + + state = object; } } } + if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { + if events_in_observation != *expected_count { + event_iter.diagnostics.add( + Diagnostic::new( + DiagnosticLevel::Warning, + DiagnosticCode::ChangeCountMismatch, + format!( + "The observe event at line {} declared {} changes, but I found {}.", + obs_line, expected_count, events_in_observation + ), + ) + .with_location(filename.to_string(), *obs_line), + ); + } + } + + Ok(ReadResult { + header, + final_state: state, + diagnostics: event_iter.diagnostics, + observation_count, + }) +} + +fn parse_header(filename: &str, line: &str, line_number: usize) -> Result { + let value: Value = serde_json::from_str(line).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::MissingHeader, + format!("I couldn't parse the header as JSON: {}", e), + ) + .with_location(filename.to_string(), line_number) + .with_snippet(format!("{} | {}", line_number, line)) + .with_advice( + "The first line must be a JSON object containing the archive header.\n\ + Required fields: type, version, created, initial" + .to_string(), + ) + })?; + + let header = serde_json::from_value::
(value).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::MissingHeaderField, + format!("I couldn't parse the header: {}", e), + ) + .with_location(filename.to_string(), line_number) + .with_snippet(format!("{} | {}", line_number, line)) + .with_advice( + "The header must contain:\n\ + - type: \"@peoplesgrocers/json-archive\"\n\ + - version: 1\n\ + - created: an ISO-8601 timestamp\n\ + - initial: the initial state object" + .to_string(), + ) + })?; + + if header.version != 1 { + return Err(Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + format!( + "I found version {}, but I only support version 1.", + header.version + ), + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "This archive was created with a newer or older version of the format. \ + You may need to upgrade your tools or convert the archive." + .to_string(), + )); + } + + Ok(header) } pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diagnostic> { @@ -578,7 +488,7 @@ pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diag diag.with_advice( "JSON Pointer paths must start with '/' and use '/' to separate segments.\n\ Special characters: use ~0 for ~ and ~1 for /" - .to_string() + .to_string(), ) })?; @@ -586,7 +496,7 @@ pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diag diag.with_advice( "For add operations, the parent path must exist. \ For example, to add /a/b/c, the paths /a and /a/b must already exist." - .to_string() + .to_string(), ) }) } @@ -613,20 +523,18 @@ pub fn apply_move( let array = pointer.get_mut(state)?; if !array.is_array() { - return Err( - Diagnostic::fatal( - DiagnosticCode::MoveOnNonArray, - format!( - "I can't apply move operations to '{}' because it's not an array.", - path - ), - ) - .with_advice( - "Move operations can only reorder elements within an array. \ - The path must point to an array value." - .to_string(), + return Err(Diagnostic::fatal( + DiagnosticCode::MoveOnNonArray, + format!( + "I can't apply move operations to '{}' because it's not an array.", + path ), - ); + ) + .with_advice( + "Move operations can only reorder elements within an array. \ + The path must point to an array value." + .to_string(), + )); } let arr = array.as_array_mut().unwrap(); @@ -659,7 +567,11 @@ pub fn apply_move( // Apply moves now that we know they're all valid for (from_idx, to_idx) in moves { let element = arr.remove(from_idx); - let insert_idx = if to_idx > from_idx { to_idx - 1 } else { to_idx }; + let insert_idx = if to_idx > from_idx { + to_idx - 1 + } else { + to_idx + }; arr.insert(insert_idx, element); } @@ -670,91 +582,114 @@ pub fn apply_move( mod tests { use super::*; use serde_json::json; - use std::io::Write; + use std::fs::File; + use std::io::{BufReader, Write}; use tempfile::NamedTempFile; #[test] - fn test_read_valid_archive() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_read_valid_archive() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), Some("test".to_string())); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 1]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert_eq!(result.final_state, json!({"count": 1})); assert_eq!(result.observation_count, 1); assert!(!result.diagnostics.has_fatal()); - - Ok(()) } #[test] - fn test_empty_file() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; + fn test_empty_file() { + let temp_file = NamedTempFile::new().unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ); - assert!(result.diagnostics.has_fatal()); - assert_eq!(result.diagnostics.len(), 1); - - Ok(()) + assert!(result.is_err()); } #[test] - fn test_non_existent_observation_id() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_non_existent_observation_id() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert!(result.diagnostics.has_fatal()); - - Ok(()) } #[test] - fn test_append_mode_ignores_observation_id() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_append_mode_ignores_observation_id() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::AppendSeek)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::AppendSeek, + ) + .unwrap(); assert!(!result.diagnostics.has_fatal()); assert_eq!(result.final_state, json!({"count": 1})); - - Ok(()) } #[test] - fn test_change_count_mismatch() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_change_count_mismatch() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 2]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); let warnings: Vec<_> = result .diagnostics @@ -764,28 +699,31 @@ mod tests { .collect(); assert_eq!(warnings.len(), 1); - - Ok(()) } #[test] - fn test_simple_change() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_simple_change() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 5}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 1]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert!(!result.diagnostics.has_fatal()); assert_eq!(result.final_state, json!({"count": 1})); - - Ok(()) } } diff --git a/src/archive_writer.rs b/src/archive_writer.rs index 531809b..63b807d 100644 --- a/src/archive_writer.rs +++ b/src/archive_writer.rs @@ -19,112 +19,51 @@ // marxism@peoplesgrocers.com // -use chrono::{Utc, DateTime}; +use chrono::{DateTime, Utc}; use serde_json::Value; -use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Write, Read, Seek, SeekFrom}; +use std::io::Write; use std::path::{Path, PathBuf}; use uuid::Uuid; -use crate::atomic_file::{atomic_replace_file, generate_temp_filename}; use crate::diagnostics::{Diagnostic, DiagnosticCode}; use crate::diff; use crate::events::{Event, Header, Observation}; -use crate::archive_reader::{ArchiveReader, ReadMode}; -use crate::detection::{CompressionFormat, detect_compression_format}; pub struct ArchiveWriter { - writer: BufWriter, observation_count: usize, - snapshot_interval: Option, filename: String, } impl ArchiveWriter { - pub fn new>( - path: P, - snapshot_interval: Option, - ) -> Result> { - let filename = path.as_ref().display().to_string(); - let file = match File::create(&path) { - Ok(f) => f, - Err(e) => { - let diagnostic = Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create the output file: {}", e) - ) - .with_advice( - "Make sure you have write permission in this directory and that the path is valid." - .to_string() - ); - return Err(vec![diagnostic]); - } - }; - let writer = BufWriter::new(file); - - Ok(Self { - writer, - observation_count: 0, - snapshot_interval, - filename, - }) - } - - pub fn new_append>( - path: P, - snapshot_interval: Option, - current_observation_count: usize, - ) -> Result> { - let filename = path.as_ref().display().to_string(); - let file = match OpenOptions::new().append(true).open(&path) { - Ok(f) => f, - Err(e) => { - let diagnostic = Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e) - ) - .with_advice( - "Make sure the archive file exists and you have write permission." - .to_string() - ); - return Err(vec![diagnostic]); - } - }; - let writer = BufWriter::new(file); - - Ok(Self { - writer, - observation_count: current_observation_count, - snapshot_interval, - filename, - }) - } - - pub fn write_header(&mut self, header: &Header) -> Result<(), Vec> { + pub fn write_header(&self, writer: &mut impl Write, header: &Header) -> Result<(), Diagnostic> { let header_json = match serde_json::to_string(header) { Ok(json) => json, Err(e) => { - return Err(vec![Diagnostic::fatal( + return Err(Diagnostic::fatal( DiagnosticCode::InvalidEventJson, format!("I couldn't serialize the header to JSON: {}", e), ) - .with_location(self.filename.clone(), 1)]); + .with_location(self.filename.clone(), 1)); } }; - if let Err(e) = writeln!(self.writer, "{}", header_json) { - return Err(vec![Diagnostic::fatal( + if let Err(e) = writeln!(writer, "{}", header_json) { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), ) - .with_location(self.filename.clone(), 1)]); + .with_location(self.filename.clone(), 1)); } Ok(()) } - pub fn write_comment(&mut self, comment: &str) -> Result<(), Vec> { - if let Err(e) = writeln!(self.writer, "# {}", comment) { + pub fn write_comment( + &self, + writer: &mut impl Write, + comment: &str, + ) -> Result<(), Vec> { + if let Err(e) = writeln!(writer, "# {}", comment) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -133,7 +72,11 @@ impl ArchiveWriter { Ok(()) } - pub fn write_observation(&mut self, observation: Observation) -> Result<(), Vec> { + pub fn write_observation( + &mut self, + writer: &mut impl Write, + observation: Observation, + ) -> Result<(), Vec> { let events = observation.to_events(); for event in events { @@ -147,7 +90,7 @@ impl ArchiveWriter { } }; - if let Err(e) = writeln!(self.writer, "{}", event_json) { + if let Err(e) = writeln!(writer, "{}", event_json) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -159,7 +102,11 @@ impl ArchiveWriter { Ok(()) } - pub fn write_snapshot(&mut self, object: &Value) -> Result<(), Vec> { + pub fn write_snapshot( + &self, + writer: &mut impl Write, + object: &Value, + ) -> Result<(), Vec> { let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); let snapshot = Event::Snapshot { observation_id: snapshot_id, @@ -177,7 +124,7 @@ impl ArchiveWriter { } }; - if let Err(e) = writeln!(self.writer, "{}", event_json) { + if let Err(e) = writeln!(writer, "{}", event_json) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -186,94 +133,6 @@ impl ArchiveWriter { Ok(()) } - - pub fn should_write_snapshot(&self) -> bool { - if let Some(interval) = self.snapshot_interval { - self.observation_count > 0 && self.observation_count % interval == 0 - } else { - false - } - } - - pub fn finish(mut self) -> Result<(), Vec> { - if let Err(e) = self.writer.flush() { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )]); - } - Ok(()) - } -} - -pub struct ArchiveBuilder { - initial_state: Option, - current_state: Value, - source: Option, - snapshot_interval: Option, -} - -impl ArchiveBuilder { - pub fn new() -> Self { - Self { - initial_state: None, - current_state: Value::Null, - source: None, - snapshot_interval: None, - } - } - - pub fn with_source(mut self, source: String) -> Self { - self.source = Some(source); - self - } - - pub fn with_snapshot_interval(mut self, interval: usize) -> Self { - self.snapshot_interval = Some(interval); - self - } - - pub fn add_state(&mut self, state: Value) -> Option { - if self.initial_state.is_none() { - self.initial_state = Some(state.clone()); - self.current_state = state; - return None; - } - - let observation_id = format!("obs-{}", Uuid::new_v4()); - let timestamp = Utc::now(); - - let diff_result: Vec = diff::diff(&self.current_state, &state, "", &observation_id); - self.current_state = state; - - let mut observation = Observation::new(observation_id, timestamp); - for event in diff_result { - observation.add_event(event); - } - - Some(observation) - } - - pub fn build>(self, output_path: P) -> Result<(), Vec> { - if self.initial_state.is_none() { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - "I can't build an archive without any initial state.".to_string(), - )]); - } - - let header = Header::new(self.initial_state.unwrap(), self.source); - - let mut writer = ArchiveWriter::new(output_path, self.snapshot_interval)?; - writer.write_header(&header)?; - writer.finish()?; - - Ok(()) - } - - pub fn get_initial_state(&self) -> Option<&Value> { - self.initial_state.as_ref() - } } /// Generate default output filename from input filename @@ -290,36 +149,20 @@ pub fn default_output_filename>(input_path: P) -> PathBuf { } } - // Add .json.archive extension if let Some(extension) = path.extension() { if extension == "json" { - // Replace .json with .json.archive output.set_extension("json.archive"); } else { - // Append .json.archive to whatever extension exists let new_extension = format!("{}.json.archive", extension.to_string_lossy()); output.set_extension(new_extension); } } else { - // No extension, just add .json.archive output.set_extension("json.archive"); } output } -/// Detect if a file is compressed by checking magic bytes -/// Uses the existing compression detection from reader.rs -fn is_compressed>(path: P) -> std::io::Result { - let path = path.as_ref(); - let mut file = File::open(path)?; - let mut magic_bytes = [0u8; 4]; - let bytes_read = file.read(&mut magic_bytes)?; - - let format = detect_compression_format(path, &magic_bytes[..bytes_read]); - Ok(format != CompressionFormat::None) -} - /// Get the file modification time as a DateTime fn get_file_mtime>(path: P) -> std::io::Result> { let metadata = std::fs::metadata(path)?; @@ -343,676 +186,268 @@ fn get_file_mtime>(path: P) -> std::io::Result> { /// # Returns /// /// Returns the number of observations written -fn write_observations_to_writer>( +pub fn write_observation>( writer: &mut W, - current_state: Value, - new_files: &[P], - mut observation_count: usize, + observation_count: &mut usize, snapshot_interval: Option, -) -> Result> { - let mut builder = ArchiveBuilder::new(); - builder.current_state = current_state.clone(); - builder.initial_state = Some(current_state); - - for file_path in new_files.iter() { - // Write comment - let comment = format!("# Processing file: {:?}\n", file_path.as_ref()); - if let Err(e) = writer.write_all(comment.as_bytes()) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - - // Get file modification time - let file_mtime = match get_file_mtime(file_path) { - Ok(mtime) => mtime, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get the file modification time: {}", e), - )]); - } - }; - - // Read and parse new state - let content = match std::fs::read_to_string(file_path) { - Ok(content) => content, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the input file: {}", e), - )]); - } - }; - - let state: Value = match serde_json::from_str(&content) { - Ok(state) => state, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse the input file as JSON: {}", e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())]); - } - }; - - // Generate and write observation - if let Some(mut observation) = builder.add_state(state.clone()) { - // Override the timestamp with the file modification time - observation.timestamp = file_mtime; - observation_count += 1; - - // Write observation events - for event in observation.to_events() { - let event_json = match serde_json::to_string(&event) { - Ok(json) => json, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize an event to JSON: {}", e), - )]); - } - }; - - if let Err(e) = writeln!(writer, "{}", event_json) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - } - - // Check if we should write a snapshot - if let Some(interval) = snapshot_interval { - if observation_count > 0 && observation_count % interval == 0 { - let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); - let snapshot = Event::Snapshot { - observation_id: snapshot_id, - timestamp: file_mtime, - object: state.clone(), - }; - - let snapshot_json = match serde_json::to_string(&snapshot) { - Ok(json) => json, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the snapshot to JSON: {}", e), - )]); - } - }; - - if let Err(e) = writeln!(writer, "{}", snapshot_json) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - } - } - } - } - - Ok(observation_count) -} - -pub fn create_archive_from_files>( - input_files: &[P], - output_path: P, + current_state: &Value, + filename: &P, source: Option, - snapshot_interval: Option, -) -> Result<(), Vec> { - let mut builder = ArchiveBuilder::new(); - if let Some(source) = source { - builder = builder.with_source(source); - } - if let Some(interval) = snapshot_interval { - builder = builder.with_snapshot_interval(interval); - } +) -> Result { + // Get file modification time + let file_mtime = match get_file_mtime(filename) { + Ok(mtime) => mtime, + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't get the file modification time: {}", e), + )); + } + }; - let first_content = std::fs::read_to_string(&input_files[0]).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the first input file: {}", e), - )] - })?; - - let first_state: Value = serde_json::from_str(&first_content).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse the first input file as JSON: {}", e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - let _ = builder.add_state(first_state.clone()); - - let header = Header::new(first_state, builder.source.clone()); - let mut writer = ArchiveWriter::new(&output_path, builder.snapshot_interval)?; - writer.write_header(&header)?; - - for file_path in input_files[1..].iter() { - writer.write_comment(&format!("Processing file: {:?}", file_path.as_ref()))?; - - let content = std::fs::read_to_string(file_path).map_err(|e| { - vec![Diagnostic::fatal( + let content = match std::fs::read_to_string(filename) { + Ok(content) => content, + Err(e) => { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't read the input file: {}", e), - )] - })?; + )); + } + }; - let state: Value = serde_json::from_str(&content).map_err(|e| { - vec![Diagnostic::fatal( + let state: Value = match serde_json::from_str(&content) { + Ok(state) => state, + Err(e) => { + return Err(Diagnostic::fatal( DiagnosticCode::InvalidEventJson, format!("I couldn't parse the input file as JSON: {}", e), ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - if let Some(observation) = builder.add_state(state.clone()) { - writer.write_observation(observation)?; - - if writer.should_write_snapshot() { - writer.write_snapshot(&state)?; - } - } - } - - writer.finish()?; - Ok(()) -} - -/// This reads the entire compressed archive, writes a new compressed -/// with all old events plus new observations to a temporary file, then -/// two phase commit style replace the original file. -fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - _source: Option, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Step 1: Detect compression format and decompress entire file into memory - let mut file = match File::open(archive_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the compressed archive: {}", e), - )]; + .with_advice("Make sure the file contains valid JSON.".to_string())); } }; - let mut magic_bytes = [0u8; 4]; - let bytes_read = match file.read(&mut magic_bytes) { - Ok(n) => n, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - }; - let compression = detect_compression_format(archive_path, &magic_bytes[..bytes_read]); - file.seek(SeekFrom::Start(0)).unwrap(); - - let decompressed_bytes = { - - #[cfg(feature = "compression")] - { - use flate2::read::{GzDecoder, ZlibDecoder}; - use std::io::Read; - - let mut decompressed = Vec::new(); - - match compression { - CompressionFormat::Gzip => { - let mut decoder = GzDecoder::new(file); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress gzip archive: {}", e), - )]; - } - } - CompressionFormat::Zlib => { - let mut decoder = ZlibDecoder::new(file); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress zlib archive: {}", e), - )]; - } - } - CompressionFormat::Zstd => { - let mut decoder = match zstd::stream::read::Decoder::new(file) { - Ok(d) => d, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd decoder: {}", e), - )]; - } - }; - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress zstd archive: {}", e), - )]; - } - } - CompressionFormat::Brotli => { - let mut decoder = brotli::Decompressor::new(file, 4096); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress brotli archive: {}", e), - )]; - } - } - _ => { - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("Unsupported compression format: {:?}", compression), - )]; - } - } - - decompressed - } - - #[cfg(not(feature = "compression"))] - { - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "This build doesn't support compressed archives.".to_string(), - )]; - } - }; - - // Step 2 & 3: Use AppendSeek mode to parse minimally - // The reader will seek backward through the buffer to find snapshot - let reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create archive reader: {}", e), - )]; - } - }; - - let read_result = match reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive.".to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // Step 4: Write to temp file with compression - let temp_path = generate_temp_filename(output_path); - - #[cfg(feature = "compression")] - { - use flate2::write::{GzEncoder, ZlibEncoder}; - use flate2::Compression; - - // Create temp file with same compression format as original - let temp_file = match File::create(&temp_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create temp file: {}", e), - )]; - } + if *observation_count == 0 { + // TODO: See if we can get rid of this clone on the Value + let header = Header::new(state.clone(), source); + let aw = ArchiveWriter { + observation_count: *observation_count, + filename: filename.as_ref().display().to_string(), }; + aw.write_header(writer, &header)?; + *observation_count += 1; + } else { + let observation_id = format!("obs-{}", Uuid::new_v4()); - // Helper macro to reduce code duplication - macro_rules! write_compressed { - ($encoder:expr) => {{ - // Write all old decompressed bytes - if let Err(e) = $encoder.write_all(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write old archive data: {}", e), - )]; - } + let diff_result: Vec = diff::diff(current_state, &state, "", &observation_id); - // Write new observations using core logic - match write_observations_to_writer( - &mut $encoder, - read_result.final_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => {} - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - } - - // Finish compression - if let Err(e) = $encoder.finish() { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish compression: {}", e), - )]; - } - }}; + let mut observation = Observation::new(observation_id, file_mtime); + for event in diff_result { + observation.add_event(event); } - match compression { - CompressionFormat::Gzip => { - let mut encoder = GzEncoder::new(temp_file, Compression::default()); - write_compressed!(encoder); + *observation_count += 1; + + // Write observation events + for event in observation.to_events() { + let event_json = match serde_json::to_string(&event) { + Ok(json) => json, + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::InvalidEventJson, + format!("I couldn't serialize an event to JSON: {}", e), + )); + } + }; + + if let Err(e) = writeln!(writer, "{}", event_json) { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't write to the output: {}", e), + )); } - CompressionFormat::Zlib => { - let mut encoder = ZlibEncoder::new(temp_file, Compression::default()); - write_compressed!(encoder); - } - CompressionFormat::Zstd => { - let mut encoder = match zstd::stream::write::Encoder::new(temp_file, 0) { - Ok(e) => e, + } + + // Check if we should write a snapshot + if let Some(interval) = snapshot_interval { + if *observation_count > 0 && *observation_count % interval == 0 { + let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); + let snapshot = Event::Snapshot { + observation_id: snapshot_id, + timestamp: file_mtime, + object: state.clone(), + }; + + let snapshot_json = match serde_json::to_string(&snapshot) { + Ok(json) => json, Err(e) => { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd encoder: {}", e), - )]; + return Err(Diagnostic::fatal( + DiagnosticCode::InvalidEventJson, + format!("I couldn't serialize the snapshot to JSON: {}", e), + )); } }; - write_compressed!(encoder); - } - CompressionFormat::Brotli => { - // Brotli uses a different API - no finish() method - let mut encoder = brotli::CompressorWriter::new(temp_file, 4096, 11, 22); - - // Write all old decompressed bytes - if let Err(e) = encoder.write_all(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( + + if let Err(e) = writeln!(writer, "{}", snapshot_json) { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, - format!("I couldn't write old archive data: {}", e), - )]; + format!("I couldn't write to the output: {}", e), + )); } - - // Write new observations using core logic - match write_observations_to_writer( - &mut encoder, - read_result.final_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => {} - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - } - - // Flush the encoder (brotli auto-flushes on drop, but we flush explicitly) - if let Err(e) = encoder.flush() { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush brotli compression: {}", e), - )]; - } - // Encoder will be dropped here, completing the compression - } - _ => { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("Unsupported compression format for writing: {:?}", compression), - )]; } } } - #[cfg(not(feature = "compression"))] - { - let _ = temp_path; - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "This build doesn't support compressed archives.".to_string(), - )]; - } - - // Step 5: Atomic replace - match atomic_replace_file(output_path, &temp_path) { - Ok(()) => Vec::new(), - Err(diagnostics) => diagnostics, - } -} - -pub fn append_to_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - source: Option, - snapshot_interval: Option, -) -> Vec { - // Check if the archive is compressed - let is_archive_compressed = match is_compressed(&archive_path) { - Ok(compressed) => compressed, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't check if the archive is compressed: {}", e), - )]; - } - }; - - // If compressed, use the full rewrite strategy - if is_archive_compressed { - return append_to_compressed_archive( - &archive_path, - new_files, - &output_path, - source, - snapshot_interval, - ); - } - - // For uncompressed archives, use the direct append strategy (existing code) - // Read the existing archive to get the final state - let reader = match ArchiveReader::new(&archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive for reading: {}", e), - )]; - } - }; - - let read_result = match reader.read(&archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics in the archive - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive.".to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // If output path is different from archive path, copy the archive first - if archive_path.as_ref() != output_path.as_ref() { - if let Err(e) = std::fs::copy(&archive_path, &output_path) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't copy the archive to the output location: {}", e), - )]; - } - } - - // Open file in append mode - let mut file = match OpenOptions::new().append(true).open(&output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e), - ) - .with_advice( - "Make sure the archive file exists and you have write permission.".to_string() - )]; - } - }; - - // Use core writing logic - let current_state = read_result.final_state; - match write_observations_to_writer( - &mut file, - current_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => Vec::new(), - Err(diagnostics) => diagnostics, - } + Ok(state) } #[cfg(test)] mod tests { use super::*; use serde_json::json; - use std::io::Write; + use std::fs::File; + use std::io::{BufWriter, Write}; use tempfile::NamedTempFile; + /// Helper to create a temp file with JSON content + fn create_json_file(content: &Value) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", serde_json::to_string(content).unwrap()).unwrap(); + file + } + #[test] - fn test_archive_writer_header() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; - let header = Header::new(json!({"test": "value"}), Some("test-source".to_string())); + fn test_single_file_creates_header_only() { + // When we have a single input file, the archive should contain just the header + // with that file's contents as the initial state + let input = create_json_file(&json!({"test": "value"})); + let output = NamedTempFile::new().unwrap(); + + let input_files = vec![input.path().to_path_buf()]; { - let mut writer = ArchiveWriter::new(temp_file.path(), None) - .map_err(|_| "Failed to create writer")?; - writer - .write_header(&header) - .map_err(|_| "Failed to write header")?; - writer.finish().map_err(|_| "Failed to finish")?; + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + file_path, + Some("test-source".to_string()), + ) + .unwrap(); + } + writer.flush().unwrap(); } - let content = std::fs::read_to_string(temp_file.path())?; - let lines: Vec<&str> = content.lines().collect(); + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); assert_eq!(lines.len(), 1); - let parsed_header: Header = serde_json::from_str(lines[0])?; + let parsed_header: Header = serde_json::from_str(lines[0]).unwrap(); assert_eq!(parsed_header.file_type, "@peoplesgrocers/json-archive"); assert_eq!(parsed_header.version, 1); assert_eq!(parsed_header.initial, json!({"test": "value"})); - - Ok(()) } #[test] - fn test_archive_builder() -> Result<(), Box> { - let mut builder = ArchiveBuilder::new(); + fn test_two_files_creates_header_and_observation() { + // When we have two input files, the first becomes the header's initial state + // and the second generates change events + let file1 = create_json_file(&json!({"count": 0, "name": "test"})); + let file2 = create_json_file(&json!({"count": 1, "name": "test"})); + let output = NamedTempFile::new().unwrap(); - // First state becomes initial - let result = builder.add_state(json!({"count": 0})); - assert!(result.is_none()); + let input_files = vec![file1.path().to_path_buf(), file2.path().to_path_buf()]; - // Second state generates observation - let observation = builder - .add_state(json!({"count": 1})) - .expect("Should generate observation"); - assert!(!observation.events.is_empty()); + { + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; - Ok(()) - } + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + file_path, + Some("test-source".to_string()), + ) + .unwrap(); + } + writer.flush().unwrap(); + } - #[test] - fn test_create_archive_from_files() -> Result<(), Box> { - // Create temporary input files - let mut file1 = NamedTempFile::new()?; - let mut file2 = NamedTempFile::new()?; - let output_file = NamedTempFile::new()?; + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); - writeln!(file1, r#"{{"count": 0, "name": "test"}}"#)?; - writeln!(file2, r#"{{"count": 1, "name": "test"}}"#)?; - - let input_files = vec![file1.path(), file2.path()]; - - create_archive_from_files( - &input_files, - output_file.path(), - Some("test-source".to_string()), - None, - ) - .map_err(|_| "Failed to create archive")?; - - let content = std::fs::read_to_string(output_file.path())?; - let lines: Vec<&str> = content.lines().collect(); - - assert!(lines.len() >= 2); // At least header + comment + observe + change events + // Should have header + observe event + at least one change event + assert!(lines.len() >= 2); // First line should be header - let header: Header = serde_json::from_str(lines[0])?; + let header: Header = serde_json::from_str(lines[0]).unwrap(); assert_eq!(header.file_type, "@peoplesgrocers/json-archive"); assert_eq!(header.version, 1); assert_eq!(header.initial, json!({"count": 0, "name": "test"})); - - Ok(()) } #[test] - fn test_snapshot_interval() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; - let mut writer = - ArchiveWriter::new(temp_file.path(), Some(2)).map_err(|_| "Failed to create writer")?; + fn test_snapshot_written_at_interval() { + // When snapshot_interval is set, a snapshot should be written every N observations + let file1 = create_json_file(&json!({"count": 0})); + let file2 = create_json_file(&json!({"count": 1})); + let file3 = create_json_file(&json!({"count": 2})); + let output = NamedTempFile::new().unwrap(); - assert!(!writer.should_write_snapshot()); // No observations yet + let input_files = vec![ + file1.path().to_path_buf(), + file2.path().to_path_buf(), + file3.path().to_path_buf(), + ]; - let obs1 = Observation::new("obs-1".to_string(), Utc::now()); - writer - .write_observation(obs1) - .map_err(|_| "Failed to write observation")?; - assert!(!writer.should_write_snapshot()); // 1 observation, interval is 2 + { + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + let snapshot_interval = Some(2); - let obs2 = Observation::new("obs-2".to_string(), Utc::now()); - writer - .write_observation(obs2) - .map_err(|_| "Failed to write observation")?; - assert!(writer.should_write_snapshot()); // 2 observations, should snapshot + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + snapshot_interval, + ¤t_state, + file_path, + None, + ) + .unwrap(); + } + writer.flush().unwrap(); + } - Ok(()) + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); + + // Look for a snapshot event + let has_snapshot = lines.iter().any(|line| { + if let Ok(event) = serde_json::from_str::(line) { + matches!(event, Event::Snapshot { .. }) + } else { + false + } + }); + + assert!( + has_snapshot, + "Expected a snapshot event after 2 observations" + ); } #[test] diff --git a/src/atomic_file.rs b/src/atomic_file.rs index 8b060bc..ffee8ae 100644 --- a/src/atomic_file.rs +++ b/src/atomic_file.rs @@ -110,7 +110,10 @@ pub fn generate_temp_filename>(path: P) -> PathBuf { /// /// Returns diagnostics if any step of the operation fails. The function /// attempts automatic recovery by restoring the backup if the replacement fails. -pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Result<(), Vec> { +pub fn atomic_replace_file>( + original_path: P, + temp_path: P, +) -> Result<(), Vec> { let original = original_path.as_ref(); let temp = temp_path.as_ref(); @@ -119,12 +122,13 @@ pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Re if let Some(filename_str) = filename.to_str() { // Extract random suffix from temp filename if it follows our pattern let temp_filename = temp.file_name().and_then(|f| f.to_str()).unwrap_or(""); - let random_suffix = if temp_filename.starts_with('.') && temp_filename.contains(filename_str) { - // Extract suffix after the original filename - temp_filename.rsplit('.').next().unwrap_or("backup") - } else { - "backup" - }; + let random_suffix = + if temp_filename.starts_with('.') && temp_filename.contains(filename_str) { + // Extract suffix after the original filename + temp_filename.rsplit('.').next().unwrap_or("backup") + } else { + "backup" + }; let backup_filename = format!(".{}.{}.old", filename_str, random_suffix); if let Some(parent) = original.parent() { @@ -148,7 +152,7 @@ pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Re ) .with_advice( "Make sure you have write permission in this directory and sufficient disk space." - .to_string() + .to_string(), )]); } diff --git a/src/bin/pointer_errors_demo.rs b/src/bin/pointer_errors_demo.rs index 9f95c45..dc2c8cd 100644 --- a/src/bin/pointer_errors_demo.rs +++ b/src/bin/pointer_errors_demo.rs @@ -15,7 +15,8 @@ fn print_example(pointer_str: &str, value: &mut serde_json::Value) { } fn main() { - print!(r#" + print!( + r#" # JSON Pointer Diagnostics @@ -52,7 +53,8 @@ or submit a pull request. Key doesn't exist in the object. Shows available keys and suggests typos. -"#); +"# + ); print_example( "/user/emial", @@ -65,13 +67,15 @@ Key doesn't exist in the object. Shows available keys and suggests typos. }), ); - print!(r#" + print!( + r#" ## Type Mismatch Tried to index into a value that doesn't support it (e.g., `/domain` on a string, `/0` on a number). Shows the actual type. -"#); +"# + ); print_example( "/users/0/email/domain", @@ -82,12 +86,14 @@ Tried to index into a value that doesn't support it (e.g., `/domain` on a string }), ); - print!(r#" + print!( + r#" ## Array Index Out of Bounds Index past the end of the array. Shows the array length. -"#); +"# + ); print_example( "/items/5", @@ -96,12 +102,14 @@ Index past the end of the array. Shows the array length. }), ); - print!(r#" + print!( + r#" ## Array Index If you think you have an object but you're actually indexing into an array, you'll see this error. -"#); +"# + ); print_example( "/items/foo", @@ -110,13 +118,15 @@ If you think you have an object but you're actually indexing into an array, you' }), ); - print!(r#" + print!( + r#" ## Deep Path Failures For long paths, the underline shows which segment failed. The full path remains visible so you can see what you were trying to reach. -"#); +"# + ); print_example( "/data/users/0/profile/settings/theme", diff --git a/src/cmd/info.rs b/src/cmd/info.rs index 15b8502..093d189 100644 --- a/src/cmd/info.rs +++ b/src/cmd/info.rs @@ -21,7 +21,9 @@ use crate::flags; use chrono::{DateTime, Utc}; -use json_archive::{Diagnostic, DiagnosticCode, DiagnosticLevel, Event}; +use json_archive::archive_open::open_archive; +use json_archive::detection::CompressionFormat; +use json_archive::{read_events, Diagnostic, DiagnosticCode, DiagnosticLevel, Event}; use serde::Serialize; use std::path::Path; @@ -46,6 +48,7 @@ struct JsonObservation { #[derive(Serialize)] struct JsonInfoOutput { archive: String, + compression: String, created: String, file_size: u64, snapshot_count: usize, @@ -54,9 +57,9 @@ struct JsonInfoOutput { efficiency_percent: f64, } -pub fn run(flags: &flags::Info) -> Vec { +pub fn run(flags: &flags::Info) -> Result<(), Vec> { if !flags.file.exists() { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::PathNotFound, format!("I couldn't find the archive file: {}", flags.file.display()), @@ -65,12 +68,13 @@ pub fn run(flags: &flags::Info) -> Vec { "Make sure the file path is correct and the file exists. \ Check for typos in the filename." .to_string(), - )]; + )]); } - let (observations, snapshot_count) = match collect_observations(&flags.file) { - Ok((obs, count)) => (obs, count), - Err(diagnostics) => return diagnostics, + let (observations, snapshot_count, compression_format) = match collect_observations(&flags.file) + { + Ok((obs, count, format)) => (obs, count, format), + Err(diagnostics) => return Err(diagnostics), }; let file_size = match std::fs::metadata(&flags.file) { @@ -79,7 +83,10 @@ pub fn run(flags: &flags::Info) -> Vec { }; // Calculate total JSON size (sum of all observations + newline separators) - let total_json_size: u64 = observations.iter().map(|obs| obs.json_size as u64).sum::() + let total_json_size: u64 = observations + .iter() + .map(|obs| obs.json_size as u64) + .sum::() + (observations.len() as u64).saturating_sub(1); // Add newlines between observations let efficiency_percent = if total_json_size > 0 { @@ -96,6 +103,7 @@ pub fn run(flags: &flags::Info) -> Vec { if observations.is_empty() { let empty_output = JsonInfoOutput { archive: flags.file.display().to_string(), + compression: compression_format.to_string(), created: "".to_string(), file_size, snapshot_count, @@ -107,7 +115,7 @@ pub fn run(flags: &flags::Info) -> Vec { "{}", serde_json::to_string_pretty(&empty_output).unwrap_or_default() ); - return Vec::new(); + return Ok(()); } let json_observations: Vec = observations @@ -128,6 +136,7 @@ pub fn run(flags: &flags::Info) -> Vec { let json_output = JsonInfoOutput { archive: flags.file.display().to_string(), + compression: compression_format.to_string(), created: observations[0].created.to_rfc3339(), file_size, snapshot_count, @@ -143,10 +152,11 @@ pub fn run(flags: &flags::Info) -> Vec { } else { // Human-readable output mode println!("Archive: {}", flags.file.display()); + println!("Compression: {}", compression_format); if observations.is_empty() { println!("No observations found"); - return Vec::new(); + return Ok(()); } let first_timestamp = &observations[0].created; @@ -217,56 +227,26 @@ pub fn run(flags: &flags::Info) -> Vec { snapshot_text, comparison ); - println!( - "Data size: {}", - format_size(total_json_size) - ); + println!("Data size: {}", format_size(total_json_size)); // Add usage instructions println!(); println!("To get the JSON value at a specific observation:"); - println!(" json-archive state --index <#> {}", flags.file.display()); - println!( - " json-archive state --id {}", - flags.file.display() - ); - println!(); - println!("Examples:"); - println!( - " json-archive state --index 0 {} # Get initial state", - flags.file.display() - ); - println!( - " json-archive state --index 2 {} # Get state after observation 2", - flags.file.display() - ); + println!(" json-archive state --index <#> "); + println!(" json-archive state --id "); } - Vec::new() + Ok(()) } -fn collect_observations(file_path: &Path) -> Result<(Vec, usize), Vec> { - let reader = match json_archive::ArchiveReader::new(file_path, json_archive::ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file: {}", e), - )]); - } - }; +fn collect_observations( + file_path: &Path, +) -> Result<(Vec, usize, CompressionFormat), Vec> { + let opened = open_archive(file_path)?; + let compression_format = opened.format; - let (initial_state, mut event_iter) = match reader.events(file_path) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive file: {}", e), - )]); - } - }; + let (initial_state, mut event_iter) = + read_events(opened.reader, &file_path.display().to_string())?; // Check for fatal diagnostics from initial parsing if event_iter.diagnostics.has_fatal() { @@ -295,7 +275,11 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize // Iterate through events while let Some(event) = event_iter.next() { match event { - Event::Observe { observation_id, timestamp, change_count } => { + Event::Observe { + observation_id, + timestamp, + change_count, + } => { observations.push(ObservationInfo { id: observation_id, timestamp, @@ -316,7 +300,9 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize } } } - Event::Change { path, new_value, .. } => { + Event::Change { + path, new_value, .. + } => { let _ = json_archive::apply_change(&mut current_state, &path, new_value); // Update the JSON size of the last observation @@ -368,10 +354,9 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize } } - Ok((observations, snapshot_count)) + Ok((observations, snapshot_count, compression_format)) } - fn format_timestamp(dt: &DateTime) -> String { dt.format("%a %H:%M:%S %d-%b-%Y").to_string() } diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index e54432a..6ac4eb6 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -21,3 +21,4 @@ pub mod info; pub mod state; +pub mod write; diff --git a/src/cmd/state.rs b/src/cmd/state.rs index 92da1b0..5aebc6a 100644 --- a/src/cmd/state.rs +++ b/src/cmd/state.rs @@ -21,7 +21,11 @@ use crate::flags; use chrono::{DateTime, Utc}; -use json_archive::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, Diagnostic, DiagnosticCode, DiagnosticLevel, Event, ReadMode}; +use json_archive::archive_open::open_archive; +use json_archive::{ + apply_add, apply_change, apply_move, apply_remove, read_events, Diagnostic, DiagnosticCode, + DiagnosticLevel, Event, +}; use serde_json::Value; use std::path::Path; @@ -35,9 +39,9 @@ enum AccessMethod { Latest, } -pub fn run(flags: &flags::State) -> Vec { +pub fn run(flags: &flags::State) -> Result<(), Vec> { if !flags.file.exists() { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::PathNotFound, format!("I couldn't find the archive file: {}", flags.file.display()), @@ -46,34 +50,34 @@ pub fn run(flags: &flags::State) -> Vec { "Make sure the file path is correct and the file exists. \ Check for typos in the filename." .to_string(), - )]; + )]); } // Parse and validate flags - ensure only one access method is specified let access_method = match parse_access_method(flags) { Ok(method) => method, - Err(diagnostic) => return vec![diagnostic], + Err(diagnostic) => return Err(vec![diagnostic]), }; // Find and replay to the target observation let target_state = match find_and_replay_to_target(&flags.file, &access_method) { Ok(state) => state, - Err(diagnostics) => return diagnostics, + Err(diagnostics) => return Err(diagnostics), }; // Output the JSON state match serde_json::to_string_pretty(&target_state) { Ok(json) => println!("{}", json), Err(e) => { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::InvalidEventJson, format!("I couldn't serialize the state to JSON: {}", e), - )]; + )]); } } - Vec::new() + Ok(()) } fn parse_access_method(flags: &flags::State) -> Result { @@ -151,27 +155,10 @@ fn find_and_replay_to_target( file_path: &Path, access_method: &AccessMethod, ) -> Result> { - let reader = match ArchiveReader::new(file_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file: {}", e), - )]); - } - }; + let opened = open_archive(file_path)?; - let (initial_state, mut event_iter) = match reader.events(file_path) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive file: {}", e), - )]); - } - }; + let (initial_state, mut event_iter) = + read_events(opened.reader, &file_path.display().to_string())?; // Check for fatal diagnostics from initial parsing if event_iter.diagnostics.has_fatal() { @@ -193,7 +180,11 @@ fn find_and_replay_to_target( // Process events and track state at each observation while let Some(event) = event_iter.next() { match event { - Event::Observe { observation_id, timestamp, change_count: _ } => { + Event::Observe { + observation_id, + timestamp, + change_count: _, + } => { observations.push(ObservationWithEvents { id: observation_id, timestamp, @@ -210,7 +201,9 @@ fn find_and_replay_to_target( } } } - Event::Change { path, new_value, .. } => { + Event::Change { + path, new_value, .. + } => { let _ = apply_change(&mut current_state, &path, new_value); // Update the final state of the last observation diff --git a/src/cmd/write.rs b/src/cmd/write.rs new file mode 100644 index 0000000..d513c18 --- /dev/null +++ b/src/cmd/write.rs @@ -0,0 +1,316 @@ +use crate::flags; +use json_archive::archive_open::{check_compression_support, open_archive}; +use json_archive::archive_reader::{read_archive, ReadMode}; +use json_archive::archive_writer::{default_output_filename, write_observation}; +use json_archive::atomic_file::atomic_replace_file; +use json_archive::compression_writer::CompressionWriter; +use json_archive::detection::CompressionFormat; +use json_archive::write_strategy::{determine_strategy, WriteStrategy}; +use json_archive::{is_json_archive, Diagnostic, DiagnosticCode, DiagnosticLevel}; + +use serde_json::Value; +use std::fs::{File, OpenOptions}; +use std::io::{BufWriter, Write}; +use std::path::{Path, PathBuf}; + +pub fn run(flags: &flags::Write) -> Result<(), Vec> { + let (input_files, strategy) = parse_flags(flags)?; + + assert!(!input_files.is_empty()); + + match strategy { + WriteStrategy::Create { + output: (dest, dest_fmt), + } => { + check_compression_support(dest_fmt, &dest, "write")?; + + println!("Creating new archive: {}", dest.display()); + println!("Input files: {:?}", input_files); + + // Create the writer - on error, no file cleanup needed since create failed + let mut writer = CompressionWriter::create(&dest, dest_fmt)?; + + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + for file in input_files { + // TODO: On write error, we need to clean up the partially written file ourselves + current_state = write_observation( + &mut writer, + &mut observation_count, + flags.snapshot_interval, + ¤t_state, + &file, + flags.source.clone(), + )?; + } + + // Finalize compression and flush buffers. + // Note: finish() does not clean up the file on error - caller must + // remove the file themselves if this fails. + if let Err(diagnostics) = writer.finish() { + let _ = std::fs::remove_file(&dest); + return Err(diagnostics); + } + + println!("Archive created successfully: {}", dest.display()); + Ok(()) + } + WriteStrategy::Append { path } => { + let opened = open_archive(&path)?; + let read_result = read_archive( + opened.reader, + &path.display().to_string(), + ReadMode::AppendSeek, + )?; + + if read_result.diagnostics.has_fatal() { + return Err(read_result.diagnostics.into_diagnostics()); + } + + let mut current_state = read_result.final_state; + // observation_count starts at existing count + 1 (header counts as first observation) + let mut observation_count = read_result.observation_count + 1; + + // Note, we are reopening the same file for appending. So getting a new file descriptor + let mut writer = BufWriter::new(open_for_appending(&path)?); + + for filename in input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + flags.snapshot_interval, + ¤t_state, + &filename, + flags.source.clone(), + )?; + } + + writer.flush().map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't flush the output: {}", e), + ) + })?; + + Ok(()) + } + WriteStrategy::CopyOnWrite { + input: (src, fmt_src), + output: (dest, fmt_dest), + } => { + assert!(src != dest); + check_compression_support(fmt_src, &src, "read")?; + check_compression_support(fmt_dest, &dest, "write")?; + + copy_and_append( + &src, + &dest, + fmt_dest, + &input_files, + flags.snapshot_interval, + flags.source.clone(), + ) + } + WriteStrategy::AtomicSwap { + path, + compression: format, + temp_path, + } => { + assert!(path != temp_path); + check_compression_support(format, &path, "read")?; + + copy_and_append( + &path, + &temp_path, + format, + &input_files, + flags.snapshot_interval, + flags.source.clone(), + )?; + + atomic_replace_file(&path, &temp_path) + } + } +} + +fn open_for_appending(path: &Path) -> Result> { + let file: File = OpenOptions::new().append(true).open(&path).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't open the archive file for appending: {}", e), + ) + .with_advice("Make sure the archive file exists and you have write permission.".to_string()) + })?; + Ok(file) +} + +/// Copy an archive from source to destination, then append new observations. +/// +/// This handles decompression of the source and compression of the destination +/// transparently. On error, the destination file is removed. +fn copy_and_append( + src: &Path, + dest: &Path, + dest_fmt: CompressionFormat, + input_files: &[PathBuf], + snapshot_interval: Option, + source: Option, +) -> Result<(), Vec> { + assert!(src != dest); + let opened = open_archive(src)?; + let mut reader = opened.reader; + + // Create destination writer (handles compression) + let mut writer = CompressionWriter::create(dest, dest_fmt)?; + + // Copy all decompressed bytes to the new (possibly compressed) destination + std::io::copy(&mut reader, &mut writer).map_err(|e| { + let _ = std::fs::remove_file(dest); + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't copy the archive contents: {}", e), + ) + })?; + + // Read the archive to get final state for appending + let opened = open_archive(src)?; + let read_result = read_archive( + opened.reader, + &src.display().to_string(), + ReadMode::AppendSeek, + )?; + + if read_result.diagnostics.has_fatal() { + let _ = std::fs::remove_file(dest); + return Err(read_result.diagnostics.into_diagnostics()); + } + + let mut current_state = read_result.final_state; + let mut observation_count = read_result.observation_count + 1; + + // Append new observations + for filename in input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + snapshot_interval, + ¤t_state, + filename, + source.clone(), + )?; + } + + // Finalize compression and flush buffers + if let Err(diagnostics) = writer.finish() { + let _ = std::fs::remove_file(dest); + return Err(diagnostics); + } + + Ok(()) +} + +/// Parse the CLI arguments to determine the destination archive and input files. +/// This consolidates all the inferring behavior in one place. +fn parse_flags(flags: &flags::Write) -> Result<(Vec, WriteStrategy), Vec> { + let mut diagnostics = Vec::new(); + if flags.inputs.is_empty() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::MissingHeaderField, + "I need at least one JSON file to create an archive, but you didn't provide any." + .to_string(), + ) + .with_advice( + "Usage: json-archive [file2.json ...]\n\n\ + The first file will be used as the initial state, and subsequent files \ + will be compared to generate change events." + .to_string(), + ), + ); + return Err(diagnostics); + } + + // I figured it would be a helpful bit of automation on behalf of the human + // user for this tool to validate all input files exist + for file in flags.inputs.iter() { + if !file.exists() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::PathNotFound, + format!("I couldn't find the input file: {}", file.display()), + ) + .with_advice( + "Make sure the file path is correct and the file exists. \ + Check for typos in the filename." + .to_string(), + ), + ); + } + } + + let source_archive: Option = if Path::new(&flags.inputs[0]).exists() + && is_json_archive(&flags.inputs[0]).unwrap_or(false) + { + Some(flags.inputs[0].clone()) + } else { + None + }; + + // Determine the destination archive path + let destination = if let Some(output) = &flags.output { + // Explicitly specified output path + output.clone() + } else if source_archive.is_some() { + source_archive.clone().unwrap() + } else { + // Infer from first input + default_output_filename(&flags.inputs[0]) + }; + + // Filter out the destination from input files to avoid read-write conflicts + let input_files: Vec<_> = flags + .inputs + .iter() + .filter(|path| { + match ( + std::fs::canonicalize(path).ok(), + std::fs::canonicalize(&destination).ok(), + ) { + (Some(p), Some(d)) => p != d, + _ => true, // Include if canonicalization fails (file doesn't exist yet) + } + }) + .cloned() + .collect(); + + if input_files.is_empty() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::MissingHeaderField, + "No input files remain after filtering out the destination archive.".to_string() + ) + .with_advice( + "You specified the output path in the list of input files. This would cause a read-write conflict.\n\ + Either remove the output path from inputs, or use a different output path with -o." + .to_string() + ) + ); + return Err(diagnostics); + } + + if !diagnostics.is_empty() { + return Err(diagnostics); + } + + Ok(( + input_files, + determine_strategy( + source_archive.as_deref(), + &destination, + CompressionFormat::None, + ), + )) +} diff --git a/src/compression_writer.rs b/src/compression_writer.rs new file mode 100644 index 0000000..cfac557 --- /dev/null +++ b/src/compression_writer.rs @@ -0,0 +1,431 @@ +// json-archive is a tool for tracking JSON file changes over time +// Copyright (C) 2025 Peoples Grocers LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . +// +// To purchase a license under different terms contact admin@peoplesgrocers.com +// To request changes, report bugs, or give user feedback contact +// marxism@peoplesgrocers.com +// + +//! Unified writer abstraction for compressed and uncompressed output. +//! +//! This module provides `CompressionWriter`, an enum that wraps different +//! compression encoders behind a common interface implementing `std::io::Write`. +//! +//! The goal is to simplify write logic by allowing callers to write to any +//! compression format using the same API, with proper error handling that +//! produces user-friendly diagnostics. + +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::Path; + +use crate::detection::CompressionFormat; +use crate::diagnostics::{Diagnostic, DiagnosticCode}; + +/// A writer that handles optional compression transparently. +/// +/// Wraps different compression encoders behind a unified interface +/// that implements `Write` and provides a `finish()` method for cleanup. +/// +/// # Example +/// +/// ```ignore +/// use json_archive::compression_writer::CompressionWriter; +/// use json_archive::detection::CompressionFormat; +/// use std::io::Write; +/// +/// let mut writer = CompressionWriter::create(path, CompressionFormat::Gzip)?; +/// writeln!(writer, "some data")?; +/// writer.finish()?; +/// ``` +// Note: Cannot derive Debug because compression encoder types don't implement Debug +pub enum CompressionWriter { + /// Uncompressed output - uses BufWriter since File has no internal buffering + Plain(BufWriter), + /// Compression encoders write directly to File - they do their own internal buffering + #[cfg(feature = "compression")] + Gzip(flate2::write::GzEncoder), + #[cfg(feature = "compression")] + Zlib(flate2::write::ZlibEncoder), + #[cfg(feature = "compression")] + Zstd(zstd::stream::write::Encoder<'static, File>), + #[cfg(feature = "compression")] + Brotli(brotli::CompressorWriter), +} + +impl CompressionWriter { + /// Open a file for writing with the specified compression format. + /// + /// # Errors + /// + /// Returns a diagnostic explaining: + /// - What file we tried to create + /// - What compression format was requested + /// - Why it failed (permissions, disk full, unsupported format, etc.) + pub fn create(path: &Path, format: CompressionFormat) -> Result> { + let file = File::create(path).map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't create the output file '{}': {}", + path.display(), + describe_io_error(&e) + ), + ) + .with_advice(advice_for_create_error(&e, path))] + })?; + + match format { + // Plain needs BufWriter since File has no internal buffering + CompressionFormat::None => Ok(Self::Plain(BufWriter::new(file))), + + // Compression encoders do their own buffering, write directly to File + #[cfg(feature = "compression")] + CompressionFormat::Gzip => { + use flate2::write::GzEncoder; + use flate2::Compression; + Ok(Self::Gzip(GzEncoder::new(file, Compression::default()))) + } + + #[cfg(feature = "compression")] + CompressionFormat::Zlib => { + use flate2::write::ZlibEncoder; + use flate2::Compression; + Ok(Self::Zlib(ZlibEncoder::new(file, Compression::default()))) + } + + #[cfg(feature = "compression")] + CompressionFormat::Deflate => { + // Deflate is a raw compression algorithm, not a container format. + // We can read deflate data, but when writing we need to pick a + // container (gzip or zlib) that provides headers and checksums. + Err(vec![Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + "I can't write raw deflate format because it's not a container format.".to_string(), + ) + .with_advice( + "Deflate is a compression algorithm, not a file format. When writing, \ + you need to choose a container format that wraps deflate data:\n\ + \n - Use .gz (gzip) for general-purpose compression\n \ + - Use .zlib for zlib-wrapped deflate\n\ + \nIf you're appending to an existing deflate file, consider converting \ + it to gzip first.".to_string() + )]) + } + + #[cfg(feature = "compression")] + CompressionFormat::Zstd => { + let encoder = zstd::stream::write::Encoder::new(file, 0).map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't initialize zstd compression for '{}': {}", + path.display(), + e + ), + )] + })?; + Ok(Self::Zstd(encoder)) + } + + #[cfg(feature = "compression")] + CompressionFormat::Brotli => { + // buffer_size=4096, quality=11 (max), lgwin=22 (default window) + Ok(Self::Brotli(brotli::CompressorWriter::new( + file, 4096, 11, 22, + ))) + } + + #[cfg(not(feature = "compression"))] + _ => Err(vec![Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + format!( + "I can't write {} compressed files because this build doesn't include compression support.", + format_name(format) + ), + ) + .with_advice("Rebuild with: cargo build --features compression".to_string())]), + } + } + + /// Finish writing and flush all buffers. + /// + /// For compressed formats, this finalizes the compression stream. + /// Must be called before dropping to ensure all data is written. + /// + /// # Errors + /// + /// Returns a diagnostic if flushing or finalizing fails. + /// + /// **Important**: This method does not clean up the output file on error. + /// If `finish()` fails, the caller is responsible for removing the + /// partially-written file themselves: + /// + /// ```ignore + /// if let Err(diagnostics) = writer.finish() { + /// let _ = std::fs::remove_file(&path); + /// return Err(diagnostics); + /// } + /// ``` + pub fn finish(self) -> Result<(), Vec> { + match self { + Self::Plain(mut w) => w.flush().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't flush the output file: {}", + describe_io_error(&e) + ), + )] + }), + + #[cfg(feature = "compression")] + Self::Gzip(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize gzip compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Zlib(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize zlib compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Zstd(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize zstd compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Brotli(mut encoder) => { + // Brotli uses a different API - no finish() method + // Flush the encoder (brotli auto-flushes on drop, but we flush explicitly) + encoder.flush().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize brotli compression: {}", + describe_io_error(&e) + ), + )] + }) + } + } + } +} + +impl Write for CompressionWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + match self { + Self::Plain(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Gzip(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Zlib(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Zstd(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Brotli(w) => w.write(buf), + } + } + + fn flush(&mut self) -> std::io::Result<()> { + match self { + Self::Plain(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Gzip(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Zlib(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Zstd(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Brotli(w) => w.flush(), + } + } +} + +/// Translate io::Error into human-readable descriptions. +fn describe_io_error(e: &std::io::Error) -> String { + match e.kind() { + std::io::ErrorKind::NotFound => "the directory doesn't exist".to_string(), + std::io::ErrorKind::PermissionDenied => "permission denied".to_string(), + std::io::ErrorKind::AlreadyExists => { + "a directory with that name already exists".to_string() + } + std::io::ErrorKind::StorageFull => "the disk is full".to_string(), + std::io::ErrorKind::ReadOnlyFilesystem => "the filesystem is read-only".to_string(), + _ => e.to_string(), + } +} + +/// Generate helpful advice based on the error type. +fn advice_for_create_error(e: &std::io::Error, path: &Path) -> String { + match e.kind() { + std::io::ErrorKind::NotFound => { + if let Some(parent) = path.parent() { + format!( + "The parent directory '{}' doesn't exist. Create it first with:\n mkdir -p '{}'", + parent.display(), + parent.display() + ) + } else { + "Check that the path is valid.".to_string() + } + } + std::io::ErrorKind::PermissionDenied => { + format!( + "You don't have write permission for this location. Try:\n ls -la '{}'", + path.parent() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| ".".to_string()) + ) + } + std::io::ErrorKind::StorageFull => { + "Free up disk space or write to a different location.".to_string() + } + _ => "Check that the path is valid and you have write permission.".to_string(), + } +} + +/// Get a human-readable name for a compression format. +#[cfg(not(feature = "compression"))] +fn format_name(format: CompressionFormat) -> &'static str { + match format { + CompressionFormat::Gzip => "gzip", + CompressionFormat::Zlib => "zlib", + CompressionFormat::Zstd => "zstd", + CompressionFormat::Brotli => "brotli", + CompressionFormat::Deflate => "deflate", + CompressionFormat::None => "uncompressed", + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Read; + use tempfile::NamedTempFile; + + #[test] + fn test_plain_writer() -> Result<(), Box> { + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::None) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello world").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + let content = std::fs::read_to_string(path)?; + assert_eq!(content, "hello world\n"); + Ok(()) + } + + #[test] + #[cfg(feature = "compression")] + fn test_gzip_writer() -> Result<(), Box> { + use flate2::read::GzDecoder; + + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::Gzip) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello gzip").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + // Verify by decompressing + let file = File::open(path)?; + let mut decoder = GzDecoder::new(file); + let mut content = String::new(); + decoder.read_to_string(&mut content)?; + assert_eq!(content, "hello gzip\n"); + Ok(()) + } + + #[test] + #[cfg(feature = "compression")] + fn test_zstd_writer() -> Result<(), Box> { + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::Zstd) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello zstd").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + // Verify by decompressing + let file = File::open(path)?; + let mut decoder = zstd::stream::read::Decoder::new(file)?; + let mut content = String::new(); + decoder.read_to_string(&mut content)?; + assert_eq!(content, "hello zstd\n"); + Ok(()) + } + + #[test] + fn test_create_nonexistent_directory() { + let result = CompressionWriter::create( + Path::new("/nonexistent/directory/file.txt"), + CompressionFormat::None, + ); + match result { + Ok(_) => panic!("Expected error for nonexistent directory"), + Err(diagnostics) => { + assert_eq!(diagnostics.len(), 1); + // The error message should mention the path + assert!( + diagnostics[0] + .description + .contains("/nonexistent/directory/file.txt"), + "Expected path in error message, got: {}", + diagnostics[0].description + ); + } + } + } +} diff --git a/src/detection.rs b/src/detection.rs index c2d8de3..8a9f069 100644 --- a/src/detection.rs +++ b/src/detection.rs @@ -91,7 +91,10 @@ pub fn is_json_archive>(path: P) -> Result /// Create a buffered reader that handles decompression if needed. #[cfg(feature = "compression")] -fn create_reader(file: File, compression: CompressionFormat) -> Result, std::io::Error> { +fn create_reader( + file: File, + compression: CompressionFormat, +) -> Result, std::io::Error> { Ok(match compression { CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))), CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))), @@ -103,7 +106,10 @@ fn create_reader(file: File, compression: CompressionFormat) -> Result Result, std::io::Error> { +fn create_reader( + file: File, + compression: CompressionFormat, +) -> Result, std::io::Error> { if compression != CompressionFormat::None { // Without compression support, we can't decompress to check the header. // Return false by returning an empty reader that will fail header check. @@ -149,6 +155,19 @@ pub enum CompressionFormat { None, } +impl std::fmt::Display for CompressionFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CompressionFormat::Gzip => write!(f, "gzip"), + CompressionFormat::Deflate => write!(f, "deflate"), + CompressionFormat::Zlib => write!(f, "zlib"), + CompressionFormat::Brotli => write!(f, "brotli"), + CompressionFormat::Zstd => write!(f, "zstd"), + CompressionFormat::None => write!(f, "none"), + } + } +} + pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat { if bytes.len() < 4 { return CompressionFormat::None; @@ -160,12 +179,19 @@ pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat } // Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda - if bytes[0] == 0x78 && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) { + if bytes[0] == 0x78 + && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) + { return CompressionFormat::Zlib; } // Zstd magic number: 0x28 0xb5 0x2f 0xfd - if bytes.len() >= 4 && bytes[0] == 0x28 && bytes[1] == 0xb5 && bytes[2] == 0x2f && bytes[3] == 0xfd { + if bytes.len() >= 4 + && bytes[0] == 0x28 + && bytes[1] == 0xb5 + && bytes[2] == 0x2f + && bytes[3] == 0xfd + { return CompressionFormat::Zstd; } diff --git a/src/diagnostics.rs b/src/diagnostics.rs index 42d7453..7a22be2 100644 --- a/src/diagnostics.rs +++ b/src/diagnostics.rs @@ -219,6 +219,12 @@ impl Diagnostic { } } +impl From for Vec { + fn from(diagnostic: Diagnostic) -> Self { + vec![diagnostic] + } +} + impl fmt::Display for Diagnostic { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let (Some(filename), Some(line)) = (&self.filename, self.line_number) { diff --git a/src/event_deserialize.rs b/src/event_deserialize.rs index c124e4a..8e33b2e 100644 --- a/src/event_deserialize.rs +++ b/src/event_deserialize.rs @@ -43,7 +43,7 @@ //! //! Spent 30 minutes looking for existing solutions. Checked: //! - serde_path_to_error: Adds field path context but still returns string errors -//! - figment: Configuration library, but sounded like could be used only for diagnostics +//! - figment: Configuration library, but sounded like could be used only for diagnostics //! - config/serde_value: Similar issue //! - json5: Relaxed JSON syntax, not diagnostic-focused //! - miette: a diagnostic library for Rust. It includes a series of @@ -63,10 +63,10 @@ //! diagnostics vec instead of returning errors. The calling code (reader.rs) attaches //! location information (filename, line number) after deserialization. +use chrono::{DateTime, Utc}; use serde::de::{Deserialize, Deserializer, SeqAccess, Visitor}; use serde_json::Value; use std::fmt; -use chrono::{DateTime, Utc}; use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticLevel}; use crate::events::Event; @@ -120,7 +120,7 @@ impl<'de> Visitor<'de> for EventVisitor { A: SeqAccess<'de>, { let mut elements: Vec = Vec::new(); - + while let Some(elem) = seq.next_element::()? { elements.push(elem); } @@ -140,7 +140,8 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the first element of an event to be a string event type.".to_string(), + "I expected the first element of an event to be a string event type." + .to_string(), ); return Ok(self.deserializer); } @@ -152,7 +153,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected an observe event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected an observe event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -176,7 +180,8 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the timestamp to be a valid ISO-8601 datetime string.".to_string(), + "I expected the timestamp to be a valid ISO-8601 datetime string." + .to_string(), ); return Ok(self.deserializer); } @@ -215,7 +220,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected an add event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected an add event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -258,7 +266,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a change event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a change event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -301,7 +312,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a remove event to have 3 fields, but found {}.", elements.len()), + format!( + "I expected a remove event to have 3 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -341,7 +355,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a move event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a move event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -394,7 +411,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a snapshot event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a snapshot event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -418,7 +438,8 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the timestamp to be a valid ISO-8601 datetime string.".to_string(), + "I expected the timestamp to be a valid ISO-8601 datetime string." + .to_string(), ); return Ok(self.deserializer); } @@ -476,14 +497,18 @@ impl EventVisitor { let from_idx = match pair[0].as_u64() { Some(i) => i as usize, None => { - return Err("I expected the 'from' index to be a non-negative integer.".to_string()); + return Err( + "I expected the 'from' index to be a non-negative integer.".to_string() + ); } }; let to_idx = match pair[1].as_u64() { Some(i) => i as usize, None => { - return Err("I expected the 'to' index to be a non-negative integer.".to_string()); + return Err( + "I expected the 'to' index to be a non-negative integer.".to_string() + ); } }; @@ -503,7 +528,7 @@ mod tests { fn test_deserialize_observe_event() { let json = json!(["observe", "obs-1", "2025-01-01T00:00:00Z", 1]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -518,7 +543,7 @@ mod tests { fn test_deserialize_add_event() { let json = json!(["add", "/count", 42, "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -533,11 +558,14 @@ mod tests { fn test_deserialize_invalid_event_type() { let json = json!(["invalid", "some", "data"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert_eq!(deserializer.diagnostics.len(), 1); - assert_eq!(deserializer.diagnostics[0].code, DiagnosticCode::UnknownEventType); + assert_eq!( + deserializer.diagnostics[0].code, + DiagnosticCode::UnknownEventType + ); assert!(deserializer.event.is_none()); } @@ -545,11 +573,14 @@ mod tests { fn test_deserialize_wrong_field_count() { let json = json!(["observe", "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert_eq!(deserializer.diagnostics.len(), 1); - assert_eq!(deserializer.diagnostics[0].code, DiagnosticCode::WrongFieldCount); + assert_eq!( + deserializer.diagnostics[0].code, + DiagnosticCode::WrongFieldCount + ); assert!(deserializer.event.is_none()); } @@ -557,7 +588,7 @@ mod tests { fn test_deserialize_move_event() { let json = json!(["move", "/items", [[0, 2], [1, 0]], "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -567,4 +598,4 @@ mod tests { if path == "/items" && moves == vec![(0, 2), (1, 0)] && observation_id == "obs-1" )); } -} \ No newline at end of file +} diff --git a/src/events.rs b/src/events.rs index da1ed45..3642c3d 100644 --- a/src/events.rs +++ b/src/events.rs @@ -89,7 +89,6 @@ pub enum Event { }, } - impl Serialize for Event { fn serialize(&self, serializer: S) -> Result where diff --git a/src/flags.rs b/src/flags.rs index 52382e3..d770c09 100644 --- a/src/flags.rs +++ b/src/flags.rs @@ -23,7 +23,7 @@ use std::path::PathBuf; xflags::xflags! { cmd json-archive { - default cmd create { + default cmd write { /// Input JSON files in chronological order. If first file is a .json.archive file, /// appends remaining files to it. Otherwise creates a new archive from all files. repeated inputs: PathBuf diff --git a/src/lib.rs b/src/lib.rs index 61234ec..a5b510e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,12 +19,11 @@ // marxism@peoplesgrocers.com // -pub mod archive_context; pub mod archive_open; -pub mod archive_ops; pub mod archive_reader; pub mod archive_writer; pub mod atomic_file; +pub mod compression_writer; pub mod detection; pub mod diagnostics; pub mod diff; @@ -33,12 +32,17 @@ pub mod events; pub mod flags; pub mod pointer; mod pointer_errors; +pub mod write_strategy; -pub use archive_writer::{ - append_to_archive, create_archive_from_files, default_output_filename, ArchiveBuilder, ArchiveWriter, +pub use archive_reader::{ + apply_add, apply_change, apply_move, apply_remove, read_archive, read_events, EventIterator, + ReadMode, ReadResult, }; +pub use archive_writer::{default_output_filename, write_observation, ArchiveWriter}; pub use detection::is_json_archive; pub use diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel}; pub use events::{Event, Header, Observation}; pub use pointer::JsonPointer; -pub use archive_reader::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, ReadMode, ReadResult}; +pub use write_strategy::{ + compression_from_extension, determine_strategy, CompressedPath, WriteStrategy, +}; diff --git a/src/main.rs b/src/main.rs index 50f6807..ecbcfc9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,9 +19,7 @@ // marxism@peoplesgrocers.com // -use json_archive::archive_ops::{append_to_archive, create_archive, default_output_filename}; -use json_archive::{is_json_archive, Diagnostic, DiagnosticCode, DiagnosticLevel}; -use std::path::Path; +use json_archive::Diagnostic; use std::process; mod cmd; @@ -30,169 +28,22 @@ mod flags; fn main() { let flags = flags::JsonArchive::from_env_or_exit(); - let diagnostics = run(flags); + if let Err(diagnostics) = run(flags) { + for diagnostic in &diagnostics { + eprintln!("{}", diagnostic); + } - for diagnostic in &diagnostics { - eprintln!("{}", diagnostic); - } - - let has_fatal = diagnostics.iter().any(|d| d.is_fatal()); - if has_fatal { - process::exit(1); + let has_fatal = diagnostics.iter().any(|d| d.is_fatal()); + if has_fatal { + process::exit(1); + } } } -fn run(flags: flags::JsonArchive) -> Vec { +fn run(flags: flags::JsonArchive) -> Result<(), Vec> { match flags.subcommand { - flags::JsonArchiveCmd::Create(create_flags) => run_create(&create_flags), + flags::JsonArchiveCmd::Write(write_flags) => cmd::write::run(&write_flags), flags::JsonArchiveCmd::Info(info_flags) => cmd::info::run(&info_flags), flags::JsonArchiveCmd::State(state_flags) => cmd::state::run(&state_flags), } } - -struct ParsedCreateArgs { - destination: std::path::PathBuf, - input_files: Vec, -} - -/// Parse the create command arguments to determine the destination archive and input files. -/// This consolidates all the inferring behavior in one place. -fn parse_create_args(flags: &flags::Create) -> Result> { - if flags.inputs.is_empty() { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::MissingHeaderField, - "I need at least one JSON file to create an archive, but you didn't provide any." - .to_string(), - ) - .with_advice( - "Usage: json-archive [file2.json ...]\n\n\ - The first file will be used as the initial state, and subsequent files \ - will be compared to generate change events." - .to_string(), - )]); - } - - // Determine the destination archive path - let destination = if let Some(output) = &flags.output { - // Explicitly specified output path - output.clone() - } else if Path::new(&flags.inputs[0]).exists() - && is_json_archive(&flags.inputs[0]).unwrap_or(false) - { - // First input is an existing archive - use it as destination - flags.inputs[0].clone() - } else { - // Infer from first input - default_output_filename(&flags.inputs[0]) - }; - - // Filter out the destination from input files to avoid read-write conflicts - let input_files: Vec<_> = flags.inputs - .iter() - .filter(|path| { - match (std::fs::canonicalize(path).ok(), std::fs::canonicalize(&destination).ok()) { - (Some(p), Some(d)) => p != d, - _ => true, // Include if canonicalization fails (file doesn't exist yet) - } - }) - .cloned() - .collect(); - - if input_files.is_empty() { - return Err(vec![ - Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::MissingHeaderField, - "No input files remain after filtering out the destination archive.".to_string() - ) - .with_advice( - "You specified the output path in the list of input files. This would cause a read-write conflict.\n\ - Either remove the output path from inputs, or use a different output path with -o." - .to_string() - ) - ]); - } - - // Validate all input files exist - let mut diagnostics = Vec::new(); - for input_path in &input_files { - if !Path::new(input_path).exists() { - diagnostics.push( - Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't find the input file: {}", input_path.display()), - ) - .with_advice( - "Make sure the file path is correct and the file exists. \ - Check for typos in the filename." - .to_string(), - ), - ); - } - } - - if !diagnostics.is_empty() { - return Err(diagnostics); - } - - Ok(ParsedCreateArgs { - destination, - input_files, - }) -} - -fn run_create(flags: &flags::Create) -> Vec { - let parsed = match parse_create_args(flags) { - Ok(parsed) => parsed, - Err(diagnostics) => return diagnostics, - }; - - if let Some(interval) = flags.snapshot_interval { - println!("Snapshot interval: every {} observations", interval); - } - - if let Some(ref source) = flags.source { - println!("Source: {}", source); - } - - // If destination exists and is an archive, append to it - if Path::new(&parsed.destination).exists() { - if let Ok(true) = is_json_archive(&parsed.destination) { - println!("Appending to existing archive: {}", parsed.destination.display()); - println!("Input files: {:?}", parsed.input_files); - - let diagnostics = append_to_archive( - &parsed.destination, - &parsed.input_files, - &parsed.destination, - flags.source.clone(), - flags.snapshot_interval, - ); - - if diagnostics.is_empty() { - println!("Archive updated successfully: {}", parsed.destination.display()); - } - - return diagnostics; - } - } - - // Otherwise create a new archive from the input files - println!("Creating new archive: {}", parsed.destination.display()); - println!("Input files: {:?}", parsed.input_files); - - let diagnostics = create_archive( - &parsed.input_files, - parsed.destination.clone(), - flags.source.clone(), - flags.snapshot_interval, - ); - - if diagnostics.is_empty() { - println!("Archive created successfully: {}", parsed.destination.display()); - } - - diagnostics -} diff --git a/src/pointer_errors.rs b/src/pointer_errors.rs index f447267..de7a1a1 100644 --- a/src/pointer_errors.rs +++ b/src/pointer_errors.rs @@ -390,7 +390,12 @@ mod tests { #[test] fn test_type_mismatch_error_output() { - let tokens = vec!["users".to_string(), "0".to_string(), "email".to_string(), "domain".to_string()]; + let tokens = vec![ + "users".to_string(), + "0".to_string(), + "email".to_string(), + "domain".to_string(), + ]; let current = Value::String("alice@example.com".to_string()); let diag = build_type_mismatch_error(&tokens, 3, "domain", ¤t); diff --git a/src/write_strategy.rs b/src/write_strategy.rs new file mode 100644 index 0000000..57b466d --- /dev/null +++ b/src/write_strategy.rs @@ -0,0 +1,352 @@ +// json-archive is a tool for tracking JSON file changes over time +// Copyright (C) 2025 Peoples Grocers LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . +// +// To purchase a license under different terms contact admin@peoplesgrocers.com +// To request changes, report bugs, or give user feedback contact +// marxism@peoplesgrocers.com +// + +//! Write strategy for archive operations. +//! +//! There are exactly two questions: +//! 1. Where do we write? (dest_path) +//! 2. Can we write there directly, or do we need to dance? +//! +//! The dance (temp file + atomic swap) is required when: +//! - source_path == dest_path, AND +//! - the file is compressed +//! +//! Why? Compressed streams don't support append. To add one +//! record to a gzip file, you decompress everything, add the +//! record, recompress everything. If you write to the same +//! file you're reading, you corrupt it mid-operation. +//! +//! So: write to temp, swap when done. See atomic_file.rs. +//! +//! When source != dest, there is no conflict. Read from source, +//! write to dest. Even if source is compressed. Even if dest +//! is compressed. Even if they use different compression. +//! The source is never modified. +//! +//! When source == dest AND uncompressed, just append. Seek to +//! end, write new records. Simple. +//! +//! The output compression format is determined by dest_path's +//! extension, not the source's format. That's a separate concern. +//! +//! ## Truth Table +//! +//! ```text +//! INPUTS OUTPUT FLAG STRATEGY +//! ─────────────────────────────────────────────────────────────── +//! [A.json, B.json] (none) Create { out: A.json.archive, fmt: None } +//! [A.json, B.json] -o X.archive.gz Create { out: X.archive.gz, fmt: Gzip } +//! +//! [A.archive, B.json] (none) Append { path: A.archive } +//! [A.archive, B.json] -o X.archive Direct { in: A.archive/None, out: X.archive/None } +//! +//! [A.archive.gz, B.json] (none) AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx } +//! [A.archive.gz, B.json] -o A.archive.gz AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx } +//! [A.archive.gz, B.json] -o X.archive Direct { in: A.archive.gz/Gzip, out: X.archive/None } +//! [A.archive.gz, B.json] -o X.archive.br Direct { in: A.archive.gz/Gzip, out: X.archive.br/Brotli } +//! ``` +//! +//! The rule: +//! ```text +//! if creating new archive: +//! Create +//! else if source != dest: +//! Direct (read from source, write to dest, transcoding as needed) +//! else if source == dest AND uncompressed: +//! Append (seek to end, write) +//! else if source == dest AND compressed: +//! AtomicSwap (read all, write to temp, swap) +//! ``` + +use std::path::{Path, PathBuf}; + +use crate::atomic_file::generate_temp_filename; +use crate::detection::CompressionFormat; + +/// A path with its compression format. +pub type CompressedPath = (PathBuf, CompressionFormat); + +/// Describes how to write archive data based on input/output paths and compression. +#[derive(Debug, Clone)] +pub enum WriteStrategy { + /// Create a new archive from scratch. No existing archive to read. + Create { output: CompressedPath }, + + /// Append to an existing uncompressed archive in-place. + /// Just seek to end and write new records. + Append { path: PathBuf }, + + /// Read from one location, write to another. + /// Handles transcoding between compression formats. + CopyOnWrite { + input: CompressedPath, + output: CompressedPath, + }, + + /// Read compressed archive, write to temp, atomic swap. + /// Required when source == dest AND compressed. + AtomicSwap { + /// The archive path (both input and output) + path: PathBuf, + /// Compression format (same for input and output in this case) + compression: CompressionFormat, + /// Temp file to write to before swapping + temp_path: PathBuf, + }, +} + +/// Determine compression format from file extension. +/// +/// Returns `CompressionFormat::None` for uncompressed files. +pub fn compression_from_extension(path: &Path) -> CompressionFormat { + let s = path.to_string_lossy(); + if s.ends_with(".gz") { + CompressionFormat::Gzip + } else if s.ends_with(".br") { + CompressionFormat::Brotli + } else if s.ends_with(".zst") { + CompressionFormat::Zstd + } else if s.ends_with(".zlib") { + CompressionFormat::Zlib + } else { + CompressionFormat::None + } +} + +/// Determine write strategy from parsed arguments. +/// +/// # Arguments +/// +/// * `source_archive` - Path to existing archive if appending, None if creating new +/// * `dest_path` - Where to write the output +/// * `source_compression` - Compression format of source (from magic bytes). Pass +/// `CompressionFormat::None` if unknown or uncompressed. +/// +/// # Returns +/// +/// The appropriate `WriteStrategy` for this operation. +pub fn determine_strategy( + source_archive: Option<&Path>, + dest_path: &Path, + source_compression: CompressionFormat, +) -> WriteStrategy { + let dest_compression = compression_from_extension(dest_path); + + // No source archive? Creating new. + let Some(source) = source_archive else { + return WriteStrategy::Create { + output: (dest_path.to_path_buf(), dest_compression), + }; + }; + + // Check if source and dest are the same file + let same_file = match (source.canonicalize(), dest_path.canonicalize()) { + (Ok(s), Ok(d)) => s == d, + // dest doesn't exist yet, or other error - not same file + _ => false, + }; + + if !same_file { + // Different files: read from source, write to dest + let source_fmt = if source_compression == CompressionFormat::None { + compression_from_extension(source) + } else { + source_compression + }; + return WriteStrategy::CopyOnWrite { + input: (source.to_path_buf(), source_fmt), + output: (dest_path.to_path_buf(), dest_compression), + }; + } + + // Same file - check if compressed + let compression = if source_compression == CompressionFormat::None { + compression_from_extension(source) + } else { + source_compression + }; + + match compression { + CompressionFormat::None => { + // Uncompressed: can append in-place + WriteStrategy::Append { + path: dest_path.to_path_buf(), + } + } + fmt => { + // Compressed: need atomic swap + WriteStrategy::AtomicSwap { + path: dest_path.to_path_buf(), + compression: fmt, + temp_path: generate_temp_filename(dest_path), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write as IoWrite; + use tempfile::NamedTempFile; + + #[test] + fn test_compression_from_extension() { + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.gz")), + CompressionFormat::Gzip + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.br")), + CompressionFormat::Brotli + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.zst")), + CompressionFormat::Zstd + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.zlib")), + CompressionFormat::Zlib + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive")), + CompressionFormat::None + ); + assert_eq!( + compression_from_extension(Path::new("foo.json")), + CompressionFormat::None + ); + } + + #[test] + fn test_create_new_archive() { + let dest = Path::new("/tmp/new.json.archive"); + let strategy = determine_strategy(None, dest, CompressionFormat::None); + + match strategy { + WriteStrategy::Create { output } => { + assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive")); + assert_eq!(output.1, CompressionFormat::None); + } + _ => panic!("Expected Create strategy"), + } + } + + #[test] + fn test_create_new_compressed_archive() { + let dest = Path::new("/tmp/new.json.archive.gz"); + let strategy = determine_strategy(None, dest, CompressionFormat::None); + + match strategy { + WriteStrategy::Create { output } => { + assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive.gz")); + assert_eq!(output.1, CompressionFormat::Gzip); + } + _ => panic!("Expected Create strategy"), + } + } + + #[test] + fn test_append_uncompressed_same_file() -> Result<(), Box> { + let mut temp = NamedTempFile::with_suffix(".json.archive")?; + writeln!(temp, "test")?; + temp.flush()?; + + let path = temp.path(); + let strategy = determine_strategy(Some(path), path, CompressionFormat::None); + + match strategy { + WriteStrategy::Append { path: p } => { + assert_eq!(p, path); + } + _ => panic!("Expected Append strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_atomic_swap_compressed_same_file() -> Result<(), Box> { + let mut temp = NamedTempFile::with_suffix(".json.archive.gz")?; + writeln!(temp, "test")?; + temp.flush()?; + + let path = temp.path(); + let strategy = determine_strategy(Some(path), path, CompressionFormat::Gzip); + + match strategy { + WriteStrategy::AtomicSwap { + path: p, + compression, + temp_path, + } => { + assert_eq!(p, path); + assert_eq!(compression, CompressionFormat::Gzip); + assert!(temp_path.to_string_lossy().contains(".json.archive.gz")); + } + _ => panic!("Expected AtomicSwap strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_direct_different_files() -> Result<(), Box> { + let mut source = NamedTempFile::with_suffix(".json.archive")?; + writeln!(source, "test")?; + source.flush()?; + + let dest = Path::new("/tmp/different.json.archive"); + let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::None); + + match strategy { + WriteStrategy::CopyOnWrite { input, output } => { + assert_eq!(input.0, source.path()); + assert_eq!(input.1, CompressionFormat::None); + assert_eq!(output.0, PathBuf::from("/tmp/different.json.archive")); + assert_eq!(output.1, CompressionFormat::None); + } + _ => panic!("Expected Direct strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_direct_transcode_compression() -> Result<(), Box> { + let mut source = NamedTempFile::with_suffix(".json.archive.gz")?; + writeln!(source, "test")?; + source.flush()?; + + let dest = Path::new("/tmp/output.json.archive.br"); + let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::Gzip); + + match strategy { + WriteStrategy::CopyOnWrite { input, output } => { + assert_eq!(input.1, CompressionFormat::Gzip); + assert_eq!(output.1, CompressionFormat::Brotli); + } + _ => panic!("Expected Direct strategy, got {:?}", strategy), + } + + Ok(()) + } +} diff --git a/tests/compressed_archive_tests.rs b/tests/compressed_archive_tests.rs index b7ea8d9..babcb7d 100644 --- a/tests/compressed_archive_tests.rs +++ b/tests/compressed_archive_tests.rs @@ -1,64 +1,94 @@ // Integration tests for compressed archive functionality -use json_archive::{append_to_archive, ArchiveWriter, Header}; -use json_archive::{ArchiveReader, ReadMode}; -use serde_json::json; -use std::io::Write; +use json_archive::archive_open::open_archive; +use json_archive::write_observation; +use json_archive::{read_archive, ReadMode}; +use serde_json::{json, Value}; +use std::fs::File; +use std::io::{BufWriter, Write}; use tempfile::NamedTempFile; #[test] #[cfg(feature = "compression")] -fn test_append_to_compressed_archive_basic() -> Result<(), Box> { +fn test_append_to_compressed_archive_basic() { use flate2::write::GzEncoder; use flate2::Compression; - // Create initial archive - let archive_file = NamedTempFile::with_suffix(".json.archive")?; - let header = Header::new(json!({"count": 0}), Some("test".to_string())); + // Create initial archive with one state + let initial_state = create_json_file(&json!({"count": 0})); + let archive_file = NamedTempFile::with_suffix(".json.archive").unwrap(); + #[allow(unused_assignments)] { - let mut writer = ArchiveWriter::new(archive_file.path(), None) - .map_err(|e| format!("Failed to create writer: {:?}", e))?; - writer.write_header(&header) - .map_err(|e| format!("Failed to write header: {:?}", e))?; - writer.finish() - .map_err(|e| format!("Failed to finish: {:?}", e))?; + let file = File::create(archive_file.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + &initial_state.path().to_path_buf(), + Some("test".to_string()), + ) + .unwrap(); + + writer.flush().unwrap(); } + dump_file(archive_file.path(), "Uncompressed archive"); + // Compress it - let compressed_file = NamedTempFile::with_suffix(".json.archive.gz")?; + let compressed_file = NamedTempFile::with_suffix(".json.archive.gz").unwrap(); { - let input = std::fs::read(archive_file.path())?; + let input = std::fs::read(archive_file.path()).unwrap(); let mut encoder = GzEncoder::new( - compressed_file.as_file().try_clone()?, - Compression::default() + compressed_file.as_file().try_clone().unwrap(), + Compression::default(), ); - encoder.write_all(&input)?; - encoder.finish()?; + encoder.write_all(&input).unwrap(); + encoder.finish().unwrap(); } - // Create a new state file to append - let mut state_file = NamedTempFile::new()?; - writeln!(state_file, r#"{{"count": 1}}"#)?; - state_file.flush()?; + dump_file(compressed_file.path(), "Compressed archive"); - // Append to compressed archive - let diagnostics = append_to_archive( - compressed_file.path(), - &[state_file.path()], - compressed_file.path(), - None, - None, - ); + // Verify the compressed archive can be read + let opened = open_archive(compressed_file.path()).unwrap(); + let result = read_archive( + opened.reader, + &compressed_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); - // Should succeed with no diagnostics - assert!(diagnostics.is_empty(), "Got diagnostics: {:?}", diagnostics); + eprintln!("=== Reader result ==="); + eprintln!("final_state: {:?}", result.final_state); + eprintln!("observation_count: {}", result.observation_count); + eprintln!("diagnostics: {:?}", result.diagnostics); + eprintln!(); - // Verify the archive was updated (decompressed) - let reader = ArchiveReader::new(compressed_file.path(), ReadMode::FullValidation)?; - let result = reader.read(compressed_file.path())?; - assert_eq!(result.final_state, json!({"count": 1})); - assert_eq!(result.observation_count, 1); - - Ok(()) + assert_eq!(result.final_state, json!({"count": 0})); + assert_eq!(result.observation_count, 0); +} + +/// Helper to create a temp file with JSON content +fn create_json_file(content: &Value) -> NamedTempFile { + let mut file = NamedTempFile::new().expect("Failed to create temp file"); + writeln!(file, "{}", serde_json::to_string(content).unwrap()).unwrap(); + file +} + +/// Debug helper: print file contents as both hex and text +fn dump_file(path: &std::path::Path, label: &str) { + let bytes = std::fs::read(path).unwrap(); + eprintln!("=== {} ({} bytes) ===", label, bytes.len()); + eprintln!("Hex: {:02x?}", &bytes[..bytes.len().min(100)]); + if let Ok(text) = std::str::from_utf8(&bytes) { + eprintln!("Text:\n{}", &text[..text.len().min(500)]); + } else { + eprintln!("(not valid UTF-8)"); + } + eprintln!(); } diff --git a/tests/compression-integration/run_brotli_test.sh b/tests/compression-integration/run_brotli_test.sh index c29185a..81d1748 100755 --- a/tests/compression-integration/run_brotli_test.sh +++ b/tests/compression-integration/run_brotli_test.sh @@ -44,11 +44,11 @@ echo "Final archive info:" # Decompress for manual inspection echo "" echo "Decompressing for comparison..." -brotli -d -k "$OUT_DIR/test.json.archive.br" +brotli -d -k "$OUT_DIR/test.json.archive.br" -o "$OUT_DIR/test-decompressed.json.archive" echo "" echo "Decompressed archive info:" -"$BINARY" info "$OUT_DIR/test.json.archive" +"$BINARY" info "$OUT_DIR/test-decompressed.json.archive" echo "" echo "Files in $OUT_DIR:"