diff --git a/Cargo.lock b/Cargo.lock index eacf05e..a8cdb8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -234,7 +234,7 @@ dependencies = [ [[package]] name = "json-archive" -version = "0.99.0" +version = "0.99.1" dependencies = [ "arbitrary", "brotli", diff --git a/Cargo.toml b/Cargo.toml index 6d12d96..3139ed2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "json-archive" -version = "0.99.0" +version = "0.99.1" edition = "2021" authors = ["Karl ", "nobody "] +homepage = "https://peoplesgrocers.com/code/oss/json-archive" repository = "https://peoplesgrocers.com/code/oss/json-archive" license = "AGPL-3.0" description = "CLI tool for tracking JSON file changes over time using delta-based archives" diff --git a/fuzz/fuzz_targets/fuzz_mutations.rs b/fuzz/fuzz_targets/fuzz_mutations.rs index a633906..bf2ad7d 100644 --- a/fuzz/fuzz_targets/fuzz_mutations.rs +++ b/fuzz/fuzz_targets/fuzz_mutations.rs @@ -1,8 +1,8 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; fn create_archive_content(data: &[u8]) -> Vec { @@ -80,24 +80,25 @@ fn create_archive_content(data: &[u8]) -> Vec { fuzz_target!(|data: &[u8]| { let archive_content = create_archive_content(data); - + if let Ok(mut temp_file) = NamedTempFile::new() { if temp_file.write_all(&archive_content).is_ok() { // Test both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { - let result = reader.read(temp_file.path()); - + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); + let result = read_archive(reader, &temp_file.path().display().to_string(), mode); + // Should never panic, regardless of input malformation match result { Ok(read_result) => { // Basic invariants that should hold for any successful parse let _ = &read_result.final_state; let _ = &read_result.diagnostics; - + // Observation count should be reasonable assert!(read_result.observation_count < 100000); - + // If we have diagnostics, they should be well-formed for diagnostic in read_result.diagnostics.diagnostics() { assert!(!diagnostic.description.is_empty()); diff --git a/fuzz/fuzz_targets/fuzz_random_bytes.rs b/fuzz/fuzz_targets/fuzz_random_bytes.rs index 0d0895a..75bd96b 100644 --- a/fuzz/fuzz_targets/fuzz_random_bytes.rs +++ b/fuzz/fuzz_targets/fuzz_random_bytes.rs @@ -1,8 +1,8 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; fuzz_target!(|data: &[u8]| { @@ -11,10 +11,11 @@ fuzz_target!(|data: &[u8]| { if temp_file.write_all(data).is_ok() { // Try to read the file with both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); // The read operation should never panic, regardless of input // It should either succeed or return an error gracefully - let _ = reader.read(temp_file.path()); + let _ = read_archive(reader, &temp_file.path().display().to_string(), mode); } } } diff --git a/fuzz/fuzz_targets/fuzz_structured.rs b/fuzz/fuzz_targets/fuzz_structured.rs index ba85a58..e787b5d 100644 --- 
a/fuzz/fuzz_targets/fuzz_structured.rs +++ b/fuzz/fuzz_targets/fuzz_structured.rs @@ -2,8 +2,8 @@ use libfuzzer_sys::fuzz_target; use arbitrary::{Arbitrary, Unstructured}; -use json_archive::{ArchiveReader, ReadMode}; -use std::io::Write; +use json_archive::{read_archive, ReadMode}; +use std::io::{BufReader, Write}; use tempfile::NamedTempFile; use serde_json::{json, Value}; @@ -160,20 +160,21 @@ fuzz_target!(|data: &[u8]| { let mut u = Unstructured::new(data); if let Ok(archive) = FuzzArchive::arbitrary(&mut u) { let content = archive.generate_archive(); - + if let Ok(mut temp_file) = NamedTempFile::new() { if temp_file.write_all(content.as_bytes()).is_ok() { // Test both validation modes for mode in [ReadMode::FullValidation, ReadMode::AppendSeek] { - if let Ok(reader) = ArchiveReader::new(temp_file.path(), mode) { - let result = reader.read(temp_file.path()); - + if let Ok(file) = std::fs::File::open(temp_file.path()) { + let reader = BufReader::new(file); + let result = read_archive(reader, &temp_file.path().display().to_string(), mode); + // The operation should never panic // Verify that diagnostics are properly generated for invalid structures if let Ok(read_result) = result { // Basic sanity checks on the result assert!(read_result.observation_count < 10000); // Reasonable upper bound - + // If there are fatal diagnostics, final state should be reasonable if read_result.diagnostics.has_fatal() { // Should still have some state (at least initial or null) diff --git a/src/archive_context.rs b/src/archive_context.rs deleted file mode 100644 index 7faeefd..0000000 --- a/src/archive_context.rs +++ /dev/null @@ -1,595 +0,0 @@ -// json-archive is a tool for tracking JSON file changes over time -// Copyright (C) 2025 Peoples Grocers LLC -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published -// by the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . -// -// To purchase a license under different terms contact admin@peoplesgrocers.com -// To request changes, report bugs, or give user feedback contact -// marxism@peoplesgrocers.com -// - -//! Archive write context and shared observation writing logic. -//! -//! This module provides: -//! - `WriteContext`: A struct that holds the state needed to write observations -//! - `write_observations`: The shared logic for diffing JSON files and writing events -//! -//! The key insight is that both create and append operations share the same -//! core logic once they've set up their initial state and writer. - -use chrono::{DateTime, Utc}; -use serde_json::Value; -use std::io::Write; -use std::path::{Path, PathBuf}; -use uuid::Uuid; - -use crate::atomic_file::atomic_replace_file; -use crate::detection::CompressionFormat; -use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector}; -use crate::diff; -use crate::events::{Event, Observation}; - -/// Strategy for finishing the write operation. -#[derive(Debug, Clone)] -pub enum FinishStrategy { - /// Just flush the writer. 
Used for: - /// - Creating new archives - /// - Appending to uncompressed archives (same file) - FlushOnly, - - /// Atomic replace: swap temp file with original. Used for: - /// - Appending to compressed archives (rewrite strategy) - AtomicReplace { - temp_path: PathBuf, - output_path: PathBuf, - }, -} - -/// Context for writing observations to an archive. -/// -/// This struct is the result of the "setup phase" for both create and append -/// operations. Once you have a WriteContext, you can use `write_observations` -/// to add new states, then call `finish` to complete the operation. -pub struct WriteContext { - /// The writer to output JSON lines to. - pub writer: W, - - /// Current state of the archive (used for diffing). - pub current_state: Value, - - /// Number of observations already in the archive. - pub observation_count: usize, - - /// Optional interval for writing snapshots. - pub snapshot_interval: Option, - - /// How to finish the write operation. - pub finish_strategy: FinishStrategy, - - /// Diagnostics collected during setup (e.g., warnings from reading existing archive). - pub diagnostics: DiagnosticCollector, -} - -impl WriteContext { - /// Create a new write context. - pub fn new( - writer: W, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - ) -> Self { - Self { - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics: DiagnosticCollector::new(), - } - } - - /// Create a write context with existing diagnostics. - pub fn with_diagnostics( - writer: W, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - diagnostics: DiagnosticCollector, - ) -> Self { - Self { - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics, - } - } - - /// Write observations for a list of JSON files. - /// - /// For each file: - /// 1. Reads and parses the JSON - /// 2. Diffs against current state - /// 3. Writes observation events - /// 4. Optionally writes a snapshot if interval is reached - /// 5. Updates current state - /// - /// Returns the number of observations written. 
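Reviewer note: for tracing what this deletion removes, the `WriteContext` described above was driven roughly as sketched here. This is a condensed restatement of `test_write_context_single_observation` further down in this deleted file; the in-memory buffer and temp file are illustrative only, and the types come from the removed `src/archive_context.rs`.

```rust
use serde_json::json;
use std::io::Write;

// From the removed module: use crate::archive_context::{FinishStrategy, WriteContext};
fn demo_write_context() {
    // Any `impl Write` works as the sink; a Vec<u8> stands in for the archive file.
    let mut output: Vec<u8> = Vec::new();

    let mut ctx = WriteContext::new(
        &mut output,
        json!({"count": 0}),       // current state to diff against
        0,                         // observations already in the archive
        None,                      // no snapshot interval
        FinishStrategy::FlushOnly, // no atomic temp-file swap needed
    );

    // One new on-disk state produces one observation (diff vs. current state).
    let mut temp = tempfile::NamedTempFile::new().unwrap();
    temp.write_all(br#"{"count": 1}"#).unwrap();
    temp.flush().unwrap();

    let written = ctx.write_observations(&[temp.path()]).unwrap();
    assert_eq!(written, 1);

    // FlushOnly just flushes; AtomicReplace would swap a temp file into place.
    ctx.finish().unwrap();
}
```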
- pub fn write_observations>( - &mut self, - files: &[P], - ) -> Result> { - let mut observations_written = 0; - - for file_path in files.iter() { - let file_path = file_path.as_ref(); - - // Write comment marking which file we're processing - if let Err(e) = writeln!(self.writer, "# Processing file: {}", file_path.display()) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - - // Get file modification time for the observation timestamp - let file_mtime = get_file_mtime(file_path)?; - - // Read and parse new state - let content = std::fs::read_to_string(file_path).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the input file '{}': {}", file_path.display(), e), - )] - })?; - - let new_state: Value = serde_json::from_str(&content).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse '{}' as JSON: {}", file_path.display(), e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - // Generate diff and create observation - let observation_id = format!("obs-{}", Uuid::new_v4()); - let diff_events = diff::diff(&self.current_state, &new_state, "", &observation_id); - - // Skip if no changes - if diff_events.is_empty() { - continue; - } - - // Create and write observation - let mut observation = Observation::new(observation_id, file_mtime); - for event in diff_events { - observation.add_event(event); - } - - self.write_observation(observation)?; - observations_written += 1; - self.observation_count += 1; - - // Check if we should write a snapshot - if self.should_write_snapshot() { - self.write_snapshot(&new_state, file_mtime)?; - } - - // Update current state for next iteration - self.current_state = new_state; - } - - Ok(observations_written) - } - - /// Write a single observation's events to the output. - fn write_observation(&mut self, observation: Observation) -> Result<(), Vec> { - for event in observation.to_events() { - let event_json = serde_json::to_string(&event).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize an event to JSON: {}", e), - )] - })?; - - writeln!(self.writer, "{}", event_json).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - })?; - } - - Ok(()) - } - - /// Check if we should write a snapshot based on observation count. - fn should_write_snapshot(&self) -> bool { - if let Some(interval) = self.snapshot_interval { - self.observation_count > 0 && self.observation_count % interval == 0 - } else { - false - } - } - - /// Write a snapshot event. - fn write_snapshot(&mut self, state: &Value, timestamp: DateTime) -> Result<(), Vec> { - let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); - let snapshot = Event::Snapshot { - observation_id: snapshot_id, - timestamp, - object: state.clone(), - }; - - let snapshot_json = serde_json::to_string(&snapshot).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the snapshot to JSON: {}", e), - )] - })?; - - writeln!(self.writer, "{}", snapshot_json).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - })?; - - Ok(()) - } - - /// Finish the write operation. 
- /// - /// This flushes the writer and, for compressed append operations, - /// performs the atomic file replacement. - pub fn finish(mut self) -> Result> { - // Flush the writer - self.writer.flush().map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )] - })?; - - // Handle atomic replacement if needed - match self.finish_strategy { - FinishStrategy::FlushOnly => { - // Nothing more to do - } - FinishStrategy::AtomicReplace { temp_path, output_path } => { - atomic_replace_file(&output_path, &temp_path)?; - } - } - - Ok(self.diagnostics) - } -} - -/// Get the file modification time as a DateTime. -fn get_file_mtime>(path: P) -> Result, Vec> { - let path = path.as_ref(); - let metadata = std::fs::metadata(path).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get metadata for '{}': {}", path.display(), e), - )] - })?; - - let modified = metadata.modified().map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get modification time for '{}': {}", path.display(), e), - )] - })?; - - Ok(modified.into()) -} - -/// Encoder wrapper that provides a uniform interface for different compression formats. -/// -/// This enum wraps the various compression encoders so we can treat them uniformly -/// in the append-to-compressed-archive flow. -#[cfg(feature = "compression")] -pub enum CompressedWriter { - Gzip(flate2::write::GzEncoder), - Zlib(flate2::write::ZlibEncoder), - Zstd(zstd::stream::write::Encoder<'static, std::fs::File>), - Brotli(brotli::CompressorWriter), -} - -#[cfg(feature = "compression")] -impl Write for CompressedWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - match self { - CompressedWriter::Gzip(w) => w.write(buf), - CompressedWriter::Zlib(w) => w.write(buf), - CompressedWriter::Zstd(w) => w.write(buf), - CompressedWriter::Brotli(w) => w.write(buf), - } - } - - fn flush(&mut self) -> std::io::Result<()> { - match self { - CompressedWriter::Gzip(w) => w.flush(), - CompressedWriter::Zlib(w) => w.flush(), - CompressedWriter::Zstd(w) => w.flush(), - CompressedWriter::Brotli(w) => w.flush(), - } - } -} - -#[cfg(feature = "compression")] -impl CompressedWriter { - /// Create a new compressed writer for the given format and file. 
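Reviewer note: since the whole encoder wrapper goes away with this file, here is a compact sketch of its lifecycle (gzip shown; zlib, zstd, and brotli were analogous). The output path is made up for illustration, and the types come from the deleted `src/archive_context.rs` plus `crate::detection` / `crate::diagnostics`.

```rust
use std::io::Write;

// From the removed module: use crate::archive_context::CompressedWriter;
// use crate::detection::CompressionFormat; use crate::diagnostics::Diagnostic;
fn demo_compressed_writer() -> Result<(), Diagnostic> {
    let file = std::fs::File::create("data.json.archive.gz")
        .expect("create output file");

    // Wrap the plain File in the format-specific encoder.
    let mut writer = CompressedWriter::new(CompressionFormat::Gzip, file)?;
    writeln!(writer, r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#)
        .expect("write header line");

    // Per the doc comment below, finish() must run before the file is closed
    // so all compressed data is flushed to disk.
    writer.finish()?;
    Ok(())
}
```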
- pub fn new(format: CompressionFormat, file: std::fs::File) -> Result { - use flate2::Compression; - - match format { - CompressionFormat::Gzip => { - Ok(CompressedWriter::Gzip(flate2::write::GzEncoder::new(file, Compression::default()))) - } - CompressionFormat::Zlib => { - Ok(CompressedWriter::Zlib(flate2::write::ZlibEncoder::new(file, Compression::default()))) - } - CompressionFormat::Zstd => { - let encoder = zstd::stream::write::Encoder::new(file, 0).map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd encoder: {}", e), - ) - })?; - Ok(CompressedWriter::Zstd(encoder)) - } - CompressionFormat::Brotli => { - Ok(CompressedWriter::Brotli(brotli::CompressorWriter::new(file, 4096, 11, 22))) - } - CompressionFormat::Deflate => { - // Deflate is typically used within gzip/zlib, not standalone for files - Err(Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "Standalone deflate compression is not supported for writing.".to_string(), - )) - } - CompressionFormat::None => { - Err(Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "CompressedWriter::new called with CompressionFormat::None".to_string(), - )) - } - } - } - - /// Finish compression and return any errors. - /// - /// This must be called before the file is closed to ensure all - /// compressed data is flushed. - pub fn finish(self) -> Result<(), Diagnostic> { - match self { - CompressedWriter::Gzip(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish gzip compression: {}", e), - ) - })?; - } - CompressedWriter::Zlib(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish zlib compression: {}", e), - ) - })?; - } - CompressedWriter::Zstd(w) => { - w.finish().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish zstd compression: {}", e), - ) - })?; - } - CompressedWriter::Brotli(mut w) => { - // Brotli doesn't have a finish() method, flush is sufficient - w.flush().map_err(|e| { - Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush brotli compression: {}", e), - ) - })?; - } - } - Ok(()) - } -} - -/// A write context specifically for compressed output. -/// -/// This wraps WriteContext to handle the finish() call properly for -/// compressed writers, which need to call finish() on the encoder -/// before the atomic file swap. -#[cfg(feature = "compression")] -pub struct CompressedWriteContext { - /// The inner write context. - inner: WriteContext, -} - -#[cfg(feature = "compression")] -impl CompressedWriteContext { - /// Create a new compressed write context. - pub fn new( - writer: CompressedWriter, - current_state: Value, - observation_count: usize, - snapshot_interval: Option, - finish_strategy: FinishStrategy, - diagnostics: DiagnosticCollector, - ) -> Self { - Self { - inner: WriteContext::with_diagnostics( - writer, - current_state, - observation_count, - snapshot_interval, - finish_strategy, - diagnostics, - ), - } - } - - /// Write observations for a list of JSON files. - pub fn write_observations>( - &mut self, - files: &[P], - ) -> Result> { - self.inner.write_observations(files) - } - - /// Write raw bytes to the output (used for copying existing archive content). 
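Reviewer note: while `write_raw` is still visible here, it is worth spelling out the call order the compressed-append flow depended on. The sketch below restates the sequence from `append_to_compressed_archive` (deleted later in this patch) as a self-contained function; argument names are illustrative, and the error handling that removes the temp file on failure is omitted.

```rust
use std::path::{Path, PathBuf};

// Types from the deleted modules: CompressedWriteContext, CompressedWriter,
// FinishStrategy (archive_context); ReadResult (archive_reader);
// Diagnostic / DiagnosticCollector (diagnostics).
fn rewrite_compressed_archive(
    compressed_writer: CompressedWriter, // encoder over the temp file
    decompressed_bytes: &[u8],           // full old archive, already decompressed
    new_files: &[&Path],                 // JSON states to append
    read_result: ReadResult,             // replay of the old archive
    snapshot_interval: Option<usize>,
    temp_path: PathBuf,
    output_path: &Path,
) -> Result<DiagnosticCollector, Vec<Diagnostic>> {
    let mut ctx = CompressedWriteContext::new(
        compressed_writer,
        read_result.final_state,
        read_result.observation_count,
        snapshot_interval,
        FinishStrategy::AtomicReplace {
            temp_path,
            output_path: output_path.to_path_buf(),
        },
        read_result.diagnostics,
    );

    ctx.write_raw(decompressed_bytes)?; // 1. verbatim copy of the old events
    ctx.write_observations(new_files)?; // 2. diff + append the new observations
    ctx.finish()                        // 3. encoder finish, then atomic swap
}
```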
- pub fn write_raw(&mut self, bytes: &[u8]) -> Result<(), Vec> { - self.inner.writer.write_all(bytes).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )] - }) - } - - /// Finish the write operation. - /// - /// This finishes the compression encoder, then performs any atomic - /// file operations needed. - pub fn finish(self) -> Result> { - let finish_strategy = self.inner.finish_strategy.clone(); - let diagnostics = self.inner.diagnostics; - - // Finish compression first - self.inner.writer.finish().map_err(|d| vec![d])?; - - // Then handle atomic replacement if needed - match finish_strategy { - FinishStrategy::FlushOnly => { - // Nothing more to do - } - FinishStrategy::AtomicReplace { temp_path, output_path } => { - atomic_replace_file(&output_path, &temp_path)?; - } - } - - Ok(diagnostics) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_write_context_single_observation() { - let mut output = Vec::new(); - let initial_state = json!({"count": 0}); - - { - let mut ctx = WriteContext::new( - &mut output, - initial_state, - 0, - None, - FinishStrategy::FlushOnly, - ); - - // Create a temp file with new state - let mut temp_file = tempfile::NamedTempFile::new().unwrap(); - std::io::Write::write_all(&mut temp_file, br#"{"count": 1}"#).unwrap(); - temp_file.flush().unwrap(); - - let count = ctx.write_observations(&[temp_file.path()]).unwrap(); - assert_eq!(count, 1); - } - - let output_str = String::from_utf8(output).unwrap(); - assert!(output_str.contains("# Processing file:")); - assert!(output_str.contains("observe")); - assert!(output_str.contains("change")); - assert!(output_str.contains("/count")); - } - - #[test] - fn test_write_context_no_changes() { - let mut output = Vec::new(); - let initial_state = json!({"count": 0}); - - { - let mut ctx = WriteContext::new( - &mut output, - initial_state, - 0, - None, - FinishStrategy::FlushOnly, - ); - - // Create a temp file with same state - let mut temp_file = tempfile::NamedTempFile::new().unwrap(); - std::io::Write::write_all(&mut temp_file, br#"{"count": 0}"#).unwrap(); - temp_file.flush().unwrap(); - - let count = ctx.write_observations(&[temp_file.path()]).unwrap(); - assert_eq!(count, 0); - } - - let output_str = String::from_utf8(output).unwrap(); - // Should have comment but no events - assert!(output_str.contains("# Processing file:")); - assert!(!output_str.contains("observe")); - } - - #[test] - fn test_should_write_snapshot() { - let output: Vec = Vec::new(); - - // No interval set - let ctx: WriteContext> = WriteContext::new( - output.clone(), - json!({}), - 5, - None, - FinishStrategy::FlushOnly, - ); - assert!(!ctx.should_write_snapshot()); - - // Interval of 2, at observation 4 (multiple of 2) - let ctx: WriteContext> = WriteContext::new( - output.clone(), - json!({}), - 4, - Some(2), - FinishStrategy::FlushOnly, - ); - assert!(ctx.should_write_snapshot()); - - // Interval of 2, at observation 3 (not multiple of 2) - let ctx: WriteContext> = WriteContext::new( - output, - json!({}), - 3, - Some(2), - FinishStrategy::FlushOnly, - ); - assert!(!ctx.should_write_snapshot()); - } -} diff --git a/src/archive_open.rs b/src/archive_open.rs index 68f3ea5..4683969 100644 --- a/src/archive_open.rs +++ b/src/archive_open.rs @@ -135,10 +135,17 @@ pub fn open_archive>(path: P) -> Result Result<(), Diagnostic> { #[cfg(not(feature = "compression"))] if format != CompressionFormat::None { @@ -154,11 +161,12 @@ pub fn 
check_compression_support( return Err(Diagnostic::fatal( DiagnosticCode::UnsupportedVersion, format!( - "I detected a {}-compressed archive, but this build doesn't support compression.", - format_name + "I inferred that you wanted to {} a {}-compressed archive at:\n\n {}\n\n\ + However, this build does not include compression libraries.", + action, format_name, filename.display() ), ) - .with_location(filename.to_string(), 1) + .with_location(filename.display().to_string(), 1) .with_advice( "This binary was built without compression support to reduce binary size and dependencies.\n\ You have two options:\n\ @@ -175,7 +183,9 @@ pub fn check_compression_support( /// /// This opens the file, reads magic bytes, and returns the compression format. /// Useful when you need to know the format before deciding how to process the file. -pub fn detect_archive_compression>(path: P) -> Result { +pub fn detect_archive_compression>( + path: P, +) -> Result { let path = path.as_ref(); let filename = path.display().to_string(); @@ -208,7 +218,11 @@ mod tests { #[test] fn test_open_uncompressed_archive() { let mut temp_file = NamedTempFile::new().unwrap(); - writeln!(temp_file, r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#).unwrap(); + writeln!( + temp_file, + r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"# + ) + .unwrap(); temp_file.flush().unwrap(); let opened = open_archive(temp_file.path()).unwrap(); diff --git a/src/archive_ops.rs b/src/archive_ops.rs deleted file mode 100644 index c03521a..0000000 --- a/src/archive_ops.rs +++ /dev/null @@ -1,644 +0,0 @@ -// json-archive is a tool for tracking JSON file changes over time -// Copyright (C) 2025 Peoples Grocers LLC -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published -// by the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . -// -// To purchase a license under different terms contact admin@peoplesgrocers.com -// To request changes, report bugs, or give user feedback contact -// marxism@peoplesgrocers.com -// - -//! High-level archive operations: create and append. -//! -//! This module provides the top-level entry points for creating and appending -//! to archives. These functions handle all the setup (opening files, detecting -//! compression, reading existing state) and then delegate to the shared -//! `WriteContext` for the actual observation writing. -//! -//! ## Architecture -//! -//! ```text -//! ┌─────────────────┐ -//! │ archive_ops.rs │ -//! │ (this module) │ -//! └────────┬────────┘ -//! │ -//! ┌─────────────────┼─────────────────┐ -//! │ │ │ -//! ▼ ▼ ▼ -//! ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ -//! │ archive_open │ │archive_context│ │ archive_reader│ -//! │ (compression) │ │ (WriteContext)│ │ (parsing) │ -//! └───────────────┘ └───────────────┘ └───────────────┘ -//! ``` -//! -//! ## Operations -//! -//! - `create_archive`: Create a new archive from one or more JSON files -//! 
- `append_to_archive`: Add observations to an existing archive - -use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Read, Write}; -use std::path::{Path, PathBuf}; - -use serde_json::Value; - -use crate::archive_context::{FinishStrategy, WriteContext}; -use crate::archive_open::{check_compression_support, detect_archive_compression, open_archive}; -use crate::archive_reader::{ArchiveReader, ReadMode}; -use crate::atomic_file::generate_temp_filename; -use crate::detection::CompressionFormat; -use crate::diagnostics::{Diagnostic, DiagnosticCode}; -use crate::events::Header; - -#[cfg(feature = "compression")] -use crate::archive_context::{CompressedWriteContext, CompressedWriter}; - -/// Create a new archive from a list of JSON files. -/// -/// The first file becomes the initial state in the header. Each subsequent -/// file generates an observation with the diff from the previous state. -/// -/// # Arguments -/// -/// * `input_files` - List of JSON files to process (at least one required) -/// * `output_path` - Path for the new archive file -/// * `source` - Optional source identifier for the header -/// * `snapshot_interval` - Optional interval for writing snapshots -/// -/// # Returns -/// -/// Returns an empty Vec on success, or a Vec of diagnostics on error. -pub fn create_archive>( - input_files: &[P], - output_path: P, - source: Option, - snapshot_interval: Option, -) -> Vec { - if input_files.is_empty() { - return vec![Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - "I need at least one input file to create an archive.".to_string(), - )]; - } - - // Read and parse the first file to get initial state - let first_path = input_files[0].as_ref(); - let first_content = match std::fs::read_to_string(first_path) { - Ok(content) => content, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the first input file '{}': {}", first_path.display(), e), - )]; - } - }; - - let initial_state: Value = match serde_json::from_str(&first_content) { - Ok(state) => state, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse '{}' as JSON: {}", first_path.display(), e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())]; - } - }; - - // Create the output file - let output_path = output_path.as_ref(); - let file = match File::create(output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create the output file '{}': {}", output_path.display(), e), - ) - .with_advice( - "Make sure you have write permission in this directory and that the path is valid." 
- .to_string(), - )]; - } - }; - - let mut writer = BufWriter::new(file); - - // Write the header - let header = Header::new(initial_state.clone(), source); - let header_json = match serde_json::to_string(&header) { - Ok(json) => json, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the header to JSON: {}", e), - )]; - } - }; - - if let Err(e) = writeln!(writer, "{}", header_json) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output file: {}", e), - )]; - } - - // If there are more files, process them through WriteContext - if input_files.len() > 1 { - let mut ctx = WriteContext::new( - writer, - initial_state, - 0, - snapshot_interval, - FinishStrategy::FlushOnly, - ); - - // Process remaining files (skip the first one which is now the initial state) - let remaining_files: Vec<&Path> = input_files[1..].iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&remaining_files) { - return diagnostics; - } - - if let Err(diagnostics) = ctx.finish() { - return diagnostics; - } - } else { - // Just flush the header - if let Err(e) = writer.flush() { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )]; - } - } - - Vec::new() -} - -/// Append observations to an existing archive. -/// -/// This function handles both compressed and uncompressed archives: -/// - Uncompressed: Opens in append mode and writes new observations directly -/// - Compressed: Reads entire archive, writes to temp file, atomic swap -/// -/// # Arguments -/// -/// * `archive_path` - Path to the existing archive -/// * `new_files` - List of JSON files to add as observations -/// * `output_path` - Where to write the result (can be same as archive_path) -/// * `source` - Optional source identifier (not currently used for append) -/// * `snapshot_interval` - Optional interval for writing snapshots -/// -/// # Returns -/// -/// Returns an empty Vec on success, or a Vec of diagnostics on error. -pub fn append_to_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - _source: Option, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Detect compression format - let format = match detect_archive_compression(archive_path) { - Ok(f) => f, - Err(diag) => return vec![diag], - }; - - // Check if this build supports the detected compression - if let Err(diag) = check_compression_support(format, &archive_path.display().to_string()) { - return vec![diag]; - } - - if format == CompressionFormat::None { - append_to_uncompressed_archive(archive_path, new_files, output_path, snapshot_interval) - } else { - append_to_compressed_archive(archive_path, new_files, output_path, format, snapshot_interval) - } -} - -/// Append to an uncompressed archive. -/// -/// This reads the archive to get the final state, then opens the file -/// in append mode to add new observations. 
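Reviewer note: before the uncompressed-append implementation below, here is a minimal driver of the top-level `append_to_archive` entry point defined above. It is essentially `test_append_to_uncompressed_archive` from the bottom of this deleted module, reshaped into a standalone function; `Header` and `append_to_archive` are the removed items.

```rust
use serde_json::json;
use std::io::Write;

fn demo_append_in_place() -> Result<(), Box<dyn std::error::Error>> {
    // A header-only archive: the first line is the JSON header with the initial state.
    let mut archive = tempfile::NamedTempFile::new()?;
    let header = Header::new(json!({"count": 0}), None);
    writeln!(archive, "{}", serde_json::to_string(&header)?)?;
    archive.flush()?;

    // The new observed state to fold into the archive.
    let mut update = tempfile::NamedTempFile::new()?;
    writeln!(update, r#"{{"count": 1}}"#)?;
    update.flush()?;

    // Same path for input and output: the uncompressed path appends in place.
    // Errors come back as diagnostics rather than panics.
    let diagnostics = append_to_archive(
        archive.path(),
        &[update.path()],
        archive.path(),
        None, // source label (unused for append)
        None, // snapshot interval
    );
    assert!(diagnostics.is_empty(), "unexpected diagnostics: {:?}", diagnostics);
    Ok(())
}
```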
-fn append_to_uncompressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Read the existing archive to get final state - let reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive for reading: {}", e), - )]; - } - }; - - let read_result = match reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics in the archive - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive." - .to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // If output path is different from archive path, copy the archive first - if archive_path != output_path { - if let Err(e) = std::fs::copy(archive_path, output_path) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't copy the archive to the output location: {}", e), - )]; - } - } - - // Open file in append mode - let file = match OpenOptions::new().append(true).open(output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e), - ) - .with_advice( - "Make sure the archive file exists and you have write permission.".to_string(), - )]; - } - }; - - // Create write context and process files - let mut ctx = WriteContext::with_diagnostics( - file, - read_result.final_state, - read_result.observation_count, - snapshot_interval, - FinishStrategy::FlushOnly, - read_result.diagnostics, - ); - - let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&file_refs) { - return diagnostics; - } - - match ctx.finish() { - Ok(collector) => collector.into_diagnostics(), - Err(diagnostics) => diagnostics, - } -} - -/// Append to a compressed archive. -/// -/// This reads the entire archive (decompressing), writes everything to a -/// new compressed temp file with the new observations, then atomically -/// swaps the temp file with the original. 
-#[cfg(feature = "compression")] -fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - format: CompressionFormat, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Step 1: Open and decompress the archive, reading all bytes - let opened = match open_archive(archive_path) { - Ok(o) => o, - Err(diag) => return vec![diag], - }; - - // Read all decompressed bytes into memory - let mut decompressed_bytes = Vec::new(); - let mut reader = opened.reader; - if let Err(e) = reader.read_to_end(&mut decompressed_bytes) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - - // Step 2: Parse the archive to get final state using AppendSeek mode - // We need to re-read from the decompressed bytes - let archive_reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create archive reader: {}", e), - )]; - } - }; - - let read_result = match archive_reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't parse the archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive." - .to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // Step 3: Create temp file with same compression format - let temp_path = generate_temp_filename(output_path); - let temp_file = match File::create(&temp_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create temp file: {}", e), - )]; - } - }; - - // Create compressed writer - let compressed_writer = match CompressedWriter::new(format, temp_file) { - Ok(w) => w, - Err(diag) => { - let _ = std::fs::remove_file(&temp_path); - return vec![diag]; - } - }; - - // Step 4: Create write context and copy old data + write new observations - let mut ctx = CompressedWriteContext::new( - compressed_writer, - read_result.final_state, - read_result.observation_count, - snapshot_interval, - FinishStrategy::AtomicReplace { - temp_path: temp_path.clone(), - output_path: output_path.to_path_buf(), - }, - read_result.diagnostics, - ); - - // Write all old decompressed bytes first - if let Err(diagnostics) = ctx.write_raw(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - - // Write new observations - let file_refs: Vec<&Path> = new_files.iter().map(|p| p.as_ref()).collect(); - if let Err(diagnostics) = ctx.write_observations(&file_refs) { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - - // Finish (this handles compression finalization and atomic swap) - match ctx.finish() { - Ok(collector) => collector.into_diagnostics(), - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - diagnostics - } - } -} - -/// Stub for when compression feature is not enabled. 
-#[cfg(not(feature = "compression"))] -fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - _new_files: &[Q], - _output_path: P, - format: CompressionFormat, - _snapshot_interval: Option, -) -> Vec { - let format_name = match format { - CompressionFormat::Gzip => "gzip", - CompressionFormat::Deflate => "deflate", - CompressionFormat::Zlib => "zlib", - CompressionFormat::Brotli => "brotli", - CompressionFormat::Zstd => "zstd", - CompressionFormat::None => unreachable!(), - }; - - vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!( - "I detected a {}-compressed archive, but this build doesn't support compression.", - format_name - ), - ) - .with_location(archive_path.as_ref().display().to_string(), 1) - .with_advice( - "This binary was built without compression support.\n\ - Install with compression: cargo install json-archive --features compression\n\ - Or decompress the file first." - .to_string(), - )] -} - -/// Generate default output filename from input filename. -/// -/// - `test.json` -> `test.json.archive` -/// - `test.txt` -> `test.txt.json.archive` -/// - `test` -> `test.json.archive` -/// - `test.json.archive` -> `test.json.archive` (unchanged) -pub fn default_output_filename>(input_path: P) -> PathBuf { - let path = input_path.as_ref(); - let mut output = path.to_path_buf(); - - // If it already ends with .json.archive, don't modify it - if let Some(filename) = path.file_name() { - if let Some(filename_str) = filename.to_str() { - if filename_str.ends_with(".json.archive") { - return output; - } - } - } - - // Add .json.archive extension - if let Some(extension) = path.extension() { - if extension == "json" { - // Replace .json with .json.archive - output.set_extension("json.archive"); - } else { - // Append .json.archive to whatever extension exists - let new_extension = format!("{}.json.archive", extension.to_string_lossy()); - output.set_extension(new_extension); - } - } else { - // No extension, just add .json.archive - output.set_extension("json.archive"); - } - - output -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - use std::io::Write as IoWrite; - use tempfile::NamedTempFile; - - #[test] - fn test_create_archive_single_file() -> Result<(), Box> { - // Create input file - let mut input_file = NamedTempFile::new()?; - writeln!(input_file, r#"{{"count": 0, "name": "test"}}"#)?; - input_file.flush()?; - - // Create output file - let output_file = NamedTempFile::new()?; - - let diagnostics = create_archive( - &[input_file.path()], - output_file.path(), - Some("test-source".to_string()), - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics); - - // Verify the output - let content = std::fs::read_to_string(output_file.path())?; - let header: Header = serde_json::from_str(content.lines().next().unwrap())?; - assert_eq!(header.file_type, "@peoplesgrocers/json-archive"); - assert_eq!(header.version, 1); - assert_eq!(header.initial, json!({"count": 0, "name": "test"})); - - Ok(()) - } - - #[test] - fn test_create_archive_multiple_files() -> Result<(), Box> { - // Create input files - let mut file1 = NamedTempFile::new()?; - let mut file2 = NamedTempFile::new()?; - writeln!(file1, r#"{{"count": 0}}"#)?; - writeln!(file2, r#"{{"count": 1}}"#)?; - file1.flush()?; - file2.flush()?; - - let output_file = NamedTempFile::new()?; - - let diagnostics = create_archive( - &[file1.path(), file2.path()], - output_file.path(), - None, - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: 
{:?}", diagnostics); - - // Verify output has header + observation events - let content = std::fs::read_to_string(output_file.path())?; - let lines: Vec<&str> = content.lines().collect(); - assert!(lines.len() >= 3); // header + comment + observe + change - - // First line should be header - let header: Header = serde_json::from_str(lines[0])?; - assert_eq!(header.initial, json!({"count": 0})); - - // Should contain observe and change events - assert!(content.contains("observe")); - assert!(content.contains("change")); - assert!(content.contains("/count")); - - Ok(()) - } - - #[test] - fn test_append_to_uncompressed_archive() -> Result<(), Box> { - // Create initial archive - let mut archive_file = NamedTempFile::new()?; - let header = Header::new(json!({"count": 0}), None); - writeln!(archive_file, "{}", serde_json::to_string(&header)?)?; - archive_file.flush()?; - - // Create file to append - let mut new_file = NamedTempFile::new()?; - writeln!(new_file, r#"{{"count": 1}}"#)?; - new_file.flush()?; - - let diagnostics = append_to_archive( - archive_file.path(), - &[new_file.path()], - archive_file.path(), - None, - None, - ); - - assert!(diagnostics.is_empty(), "Expected no errors: {:?}", diagnostics); - - // Verify the archive was updated - let content = std::fs::read_to_string(archive_file.path())?; - assert!(content.contains("observe")); - assert!(content.contains("change")); - assert!(content.contains("/count")); - - Ok(()) - } - - #[test] - fn test_default_output_filename() { - assert_eq!( - default_output_filename("test.json"), - PathBuf::from("test.json.archive") - ); - - assert_eq!( - default_output_filename("test.txt"), - PathBuf::from("test.txt.json.archive") - ); - - assert_eq!( - default_output_filename("test"), - PathBuf::from("test.json.archive") - ); - - assert_eq!( - default_output_filename("test.json.archive"), - PathBuf::from("test.json.archive") - ); - } -} diff --git a/src/archive_reader.rs b/src/archive_reader.rs index 7ea419f..d434b59 100644 --- a/src/archive_reader.rs +++ b/src/archive_reader.rs @@ -21,22 +21,12 @@ use serde_json::Value; use std::collections::HashSet; -use std::fs::File; -use std::io::{BufRead, BufReader, Read}; -use std::path::Path; +use std::io::BufRead; use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel}; use crate::event_deserialize::EventDeserializer; use crate::events::{Event, Header}; use crate::pointer::JsonPointer; -use crate::detection::{CompressionFormat, detect_compression_format}; - -#[cfg(feature = "compression")] -use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder}; -#[cfg(feature = "compression")] -use brotli::Decompressor; -#[cfg(feature = "compression")] -use zstd::stream::read::Decoder as ZstdDecoder; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ReadMode { @@ -44,11 +34,6 @@ pub enum ReadMode { AppendSeek, } -pub struct ArchiveReader { - mode: ReadMode, - filename: String, -} - #[derive(Debug)] pub struct ReadResult { pub header: Header, @@ -57,15 +42,15 @@ pub struct ReadResult { pub observation_count: usize, } -pub struct EventIterator { - reader: Box, +pub struct EventIterator { + reader: R, pub diagnostics: DiagnosticCollector, pub header: Header, filename: String, line_number: usize, } -impl Iterator for EventIterator { +impl Iterator for EventIterator { type Item = Event; fn next(&mut self) -> Option { @@ -86,7 +71,8 @@ impl Iterator for EventIterator { } // Try to parse as event - let event_deserializer = match serde_json::from_str::(&line) { + let event_deserializer = 
match serde_json::from_str::(&line) + { Ok(d) => d, Err(e) => { self.diagnostics.add( @@ -113,7 +99,7 @@ impl Iterator for EventIterator { self.diagnostics.add( diagnostic .with_location(self.filename.clone(), self.line_number) - .with_snippet(format!("{} | {}", self.line_number, line.trim())) + .with_snippet(format!("{} | {}", self.line_number, line.trim())), ); } @@ -146,431 +132,355 @@ impl Iterator for EventIterator { } } -impl ArchiveReader { - pub fn new>(path: P, mode: ReadMode) -> std::io::Result { - let filename = path.as_ref().display().to_string(); - Ok(Self { mode, filename }) - } +/// Parse header and create event iterator from any BufRead source. +pub fn read_events( + mut reader: R, + filename: &str, +) -> Result<(Value, EventIterator), Diagnostic> { + let mut header_line = String::new(); + let mut line_number = 0; - pub fn events>(&self, path: P) -> std::io::Result<(Value, EventIterator)> { - let path = path.as_ref(); - let mut file = File::open(path)?; + // Skip comment lines until we find the header + loop { + header_line.clear(); + line_number += 1; - // Detect compression format - let mut magic_bytes = [0u8; 4]; - let bytes_read = file.read(&mut magic_bytes)?; - let compression_format = detect_compression_format(path, &magic_bytes[..bytes_read]); - - // Re-open file to reset position - file = File::open(path)?; - - let mut diagnostics = DiagnosticCollector::new(); - - // Check if compression is detected but not supported - #[cfg(not(feature = "compression"))] - if compression_format != CompressionFormat::None { - let format_name = match compression_format { - CompressionFormat::Gzip => "gzip", - CompressionFormat::Deflate => "deflate", - CompressionFormat::Zlib => "zlib", - CompressionFormat::Brotli => "brotli", - CompressionFormat::Zstd => "zstd", - CompressionFormat::None => unreachable!(), - }; - - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("I detected a {}-compressed archive, but this build doesn't support compression.", format_name) - ) - .with_location(self.filename.clone(), 1) - .with_advice( - "This binary was built without compression support to reduce binary size and dependencies.\n\ - You have two options:\n\ - 1. Install the version with compression support: cargo install json-archive --features compression\n\ - 2. 
Manually decompress the file first, then use this tool on the uncompressed archive" - .to_string() - ) - ); - - // Return dummy values with fatal diagnostic - let iterator = EventIterator { - reader: Box::new(BufReader::new(std::io::empty())), - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); - } - - // Create appropriate reader based on compression format - #[cfg(feature = "compression")] - let reader: Box = match compression_format { - CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))), - CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))), - CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))), - CompressionFormat::Brotli => Box::new(BufReader::new(Decompressor::new(file, 4096))), - CompressionFormat::Zstd => Box::new(BufReader::new(ZstdDecoder::new(file)?)), - CompressionFormat::None => Box::new(BufReader::new(file)), - }; - - #[cfg(not(feature = "compression"))] - let reader: Box = Box::new(BufReader::new(file)); - - let mut reader = reader; - let mut header_line = String::new(); - - let _bytes_read = match reader.read_line(&mut header_line) { + match reader.read_line(&mut header_line) { Ok(0) => { - // Empty file - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::EmptyFile, - "I found an empty file, but I need at least a header line.".to_string(), - ) - .with_location(self.filename.clone(), 1) - .with_advice( - "See the file format specification for header structure." - .to_string(), - ), - ); - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); + // Empty file or only comments + return Err(Diagnostic::fatal( + DiagnosticCode::EmptyFile, + "I found an empty file (or only comments), but I need at least a header line." + .to_string(), + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "See the file format specification for header structure.".to_string(), + )); } - Ok(n) => n, + Ok(_) => {} Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { // UTF-8 error - diagnostics.add( + return Err( Diagnostic::fatal( DiagnosticCode::InvalidUtf8, - "I found invalid UTF-8 bytes at line 1.".to_string() + format!("I found invalid UTF-8 bytes at line {}.", line_number) ) - .with_location(self.filename.clone(), 1) + .with_location(filename.to_string(), line_number) .with_advice( "The JSON Archive format requires UTF-8 encoding. Make sure the file \ was saved with UTF-8 encoding, not Latin-1, Windows-1252, or another encoding." 
.to_string() ) ); - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); } - Err(e) => return Err(e), - }; - - let header = match self.parse_header(&header_line, 1, &mut diagnostics) { - Some(h) => h, - None => { - let iterator = EventIterator { - reader, - diagnostics, - header: Header::new(Value::Null, None), - filename: self.filename.clone(), - line_number: 1, - }; - return Ok((Value::Null, iterator)); + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't read from the archive: {}", e), + ) + .with_location(filename.to_string(), line_number)); } }; - let iterator = EventIterator { - reader, - diagnostics, - header: header.clone(), - filename: self.filename.clone(), - line_number: 1, - }; - - Ok((header.initial, iterator)) + // Skip comment lines (lines starting with #) + let trimmed = header_line.trim_start(); + if !trimmed.starts_with('#') { + break; + } } - pub fn read>(&self, path: P) -> std::io::Result { - let (initial_value, mut event_iter) = self.events(&path)?; + let header = parse_header(filename, &header_line, line_number)?; - // Check for early fatal diagnostics (like compression not supported) - if event_iter.diagnostics.has_fatal() { - return Ok(ReadResult { - header: Header::new(Value::Null, None), - final_state: Value::Null, - diagnostics: event_iter.diagnostics, - observation_count: 0, - }); - } + let iterator = EventIterator { + reader, + diagnostics: DiagnosticCollector::new(), + header: header.clone(), + filename: filename.to_string(), + line_number, + }; - let header = Header::new(initial_value.clone(), None); - let mut state = initial_value; - let mut seen_observations: HashSet = HashSet::new(); - let mut current_observation: Option<(String, usize, usize)> = None; - let mut events_in_observation = 0; - let mut observation_count = 0; + Ok((header.initial, iterator)) +} - // Process events from iterator - while let Some(event) = event_iter.next() { - let line_number = event_iter.line_number; +/// Read all events and return final state. +pub fn read_archive( + reader: R, + filename: &str, + mode: ReadMode, +) -> Result { + let (initial_value, mut event_iter) = read_events(reader, filename)?; - match event { - Event::Observe { observation_id, timestamp: _, change_count } => { - if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { - if events_in_observation != *expected_count { - event_iter.diagnostics.add( - Diagnostic::new( - DiagnosticLevel::Warning, - DiagnosticCode::ChangeCountMismatch, - format!( - "The observe event at line {} declared {} changes, but I found {}.", - obs_line, expected_count, events_in_observation - ) - ) - .with_location(self.filename.clone(), *obs_line) - .with_advice( - "Make sure the change_count in the observe event matches the number of \ - add/change/remove/move events that follow it." 
- .to_string() - ) - ); - } - } + let header = Header::new(initial_value.clone(), None); + let mut state = initial_value; + let mut seen_observations: HashSet = HashSet::new(); + let mut current_observation: Option<(String, usize, usize)> = None; + let mut events_in_observation = 0; + let mut observation_count = 0; - if seen_observations.contains(&observation_id) { + // Process events from iterator + while let Some(event) = event_iter.next() { + let line_number = event_iter.line_number; + + match event { + Event::Observe { + observation_id, + timestamp: _, + change_count, + } => { + if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { + if events_in_observation != *expected_count { event_iter.diagnostics.add( Diagnostic::new( DiagnosticLevel::Warning, - DiagnosticCode::DuplicateObservationId, - format!("I found a duplicate observation ID: '{}'", observation_id), + DiagnosticCode::ChangeCountMismatch, + format!( + "The observe event at line {} declared {} changes, but I found {}.", + obs_line, expected_count, events_in_observation + ) ) - .with_location(self.filename.clone(), line_number) + .with_location(filename.to_string(), *obs_line) .with_advice( - "Each observation ID should be unique within the archive. \ - Consider using UUIDs or timestamps to ensure uniqueness." - .to_string(), - ), - ); - } - - seen_observations.insert(observation_id.clone()); - current_observation = Some((observation_id, line_number, change_count)); - events_in_observation = 0; - observation_count += 1; - } - - Event::Add { path, value, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - .with_advice( - "Each add/change/remove/move event must reference an observation ID from a preceding observe event." 
- .to_string() - ) - ); - continue; - } - - if let Err(diag) = apply_add(&mut state, &path, value) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Change { path, new_value, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_change(&mut state, &path, new_value) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Remove { path, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_remove(&mut state, &path) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Move { path, moves, observation_id } => { - events_in_observation += 1; - - if self.mode == ReadMode::FullValidation - && !seen_observations.contains(&observation_id) - { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::NonExistentObservationId, - format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) - ) - .with_location(self.filename.clone(), line_number) - ); - continue; - } - - if let Err(diag) = apply_move(&mut state, &path, moves) { - event_iter.diagnostics.add(diag.with_location(self.filename.clone(), line_number)); - continue; - } - } - - Event::Snapshot { observation_id: _, timestamp: _, object } => { - if self.mode == ReadMode::FullValidation && state != object { - event_iter.diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::SnapshotStateMismatch, - "I found a snapshot whose state doesn't match the replayed state up to this point.".to_string() - ) - .with_location(self.filename.clone(), line_number) - .with_advice( - "This could indicate corruption or that events were applied incorrectly. \ - The snapshot state should exactly match the result of replaying all events \ - from the initial state." + "Make sure the change_count in the observe event matches the number of \ + add/change/remove/move events that follow it." 
.to_string() ) ); } - - state = object; } - } - } - if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { - if events_in_observation != *expected_count { - event_iter.diagnostics.add( - Diagnostic::new( - DiagnosticLevel::Warning, - DiagnosticCode::ChangeCountMismatch, - format!( - "The observe event at line {} declared {} changes, but I found {}.", - obs_line, expected_count, events_in_observation - ), - ) - .with_location(self.filename.clone(), *obs_line), - ); - } - } - - Ok(ReadResult { - header, - final_state: state, - diagnostics: event_iter.diagnostics, - observation_count, - }) - } - - fn parse_header( - &self, - line: &str, - line_number: usize, - diagnostics: &mut DiagnosticCollector, - ) -> Option
{ - let value: Value = match serde_json::from_str(line) { - Ok(v) => v, - Err(e) => { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::MissingHeader, - format!("I couldn't parse the header as JSON: {}", e), - ) - .with_location(self.filename.clone(), line_number) - .with_snippet(format!("{} | {}", line_number, line)) - .with_advice( - "The first line must be a JSON object containing the archive header.\n\ - Required fields: type, version, created, initial" - .to_string(), - ), - ); - return None; - } - }; - - match serde_json::from_value::
(value.clone()) { - Ok(header) => { - if header.version != 1 { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("I found version {}, but I only support version 1.", header.version) + if seen_observations.contains(&observation_id) { + event_iter.diagnostics.add( + Diagnostic::new( + DiagnosticLevel::Warning, + DiagnosticCode::DuplicateObservationId, + format!("I found a duplicate observation ID: '{}'", observation_id), ) - .with_location(self.filename.clone(), line_number) + .with_location(filename.to_string(), line_number) .with_advice( - "This archive was created with a newer or older version of the format. \ - You may need to upgrade your tools or convert the archive." + "Each observation ID should be unique within the archive. \ + Consider using UUIDs or timestamps to ensure uniqueness." + .to_string(), + ), + ); + } + + seen_observations.insert(observation_id.clone()); + current_observation = Some((observation_id, line_number, change_count)); + events_in_observation = 0; + observation_count += 1; + } + + Event::Add { + path, + value, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "Each add/change/remove/move event must reference an observation ID from a preceding observe event." .to_string() ) ); - return None; + continue; } - Some(header) + if let Err(diag) = apply_add(&mut state, &path, value) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } } - Err(e) => { - diagnostics.add( - Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - format!("I couldn't parse the header: {}", e), - ) - .with_location(self.filename.clone(), line_number) - .with_snippet(format!("{} | {}", line_number, line)) - .with_advice( - "The header must contain:\n\ - - type: \"@peoplesgrocers/json-archive\"\n\ - - version: 1\n\ - - created: an ISO-8601 timestamp\n\ - - initial: the initial state object" - .to_string(), - ), - ); - None + + Event::Change { + path, + new_value, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_change(&mut state, &path, new_value) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Remove { + path, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_remove(&mut state, &path) { + event_iter + 
.diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Move { + path, + moves, + observation_id, + } => { + events_in_observation += 1; + + if mode == ReadMode::FullValidation && !seen_observations.contains(&observation_id) + { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::NonExistentObservationId, + format!("I found a reference to observation '{}', but I haven't seen an observe event with that ID yet.", observation_id) + ) + .with_location(filename.to_string(), line_number) + ); + continue; + } + + if let Err(diag) = apply_move(&mut state, &path, moves) { + event_iter + .diagnostics + .add(diag.with_location(filename.to_string(), line_number)); + continue; + } + } + + Event::Snapshot { + observation_id: _, + timestamp: _, + object, + } => { + if mode == ReadMode::FullValidation && state != object { + event_iter.diagnostics.add( + Diagnostic::fatal( + DiagnosticCode::SnapshotStateMismatch, + "I found a snapshot whose state doesn't match the replayed state up to this point.".to_string() + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "This could indicate corruption or that events were applied incorrectly. \ + The snapshot state should exactly match the result of replaying all events \ + from the initial state." + .to_string() + ) + ); + } + + state = object; } } } + if let Some((_obs_id, obs_line, expected_count)) = ¤t_observation { + if events_in_observation != *expected_count { + event_iter.diagnostics.add( + Diagnostic::new( + DiagnosticLevel::Warning, + DiagnosticCode::ChangeCountMismatch, + format!( + "The observe event at line {} declared {} changes, but I found {}.", + obs_line, expected_count, events_in_observation + ), + ) + .with_location(filename.to_string(), *obs_line), + ); + } + } + + Ok(ReadResult { + header, + final_state: state, + diagnostics: event_iter.diagnostics, + observation_count, + }) +} + +fn parse_header(filename: &str, line: &str, line_number: usize) -> Result { + let value: Value = serde_json::from_str(line).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::MissingHeader, + format!("I couldn't parse the header as JSON: {}", e), + ) + .with_location(filename.to_string(), line_number) + .with_snippet(format!("{} | {}", line_number, line)) + .with_advice( + "The first line must be a JSON object containing the archive header.\n\ + Required fields: type, version, created, initial" + .to_string(), + ) + })?; + + let header = serde_json::from_value::
(value).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::MissingHeaderField, + format!("I couldn't parse the header: {}", e), + ) + .with_location(filename.to_string(), line_number) + .with_snippet(format!("{} | {}", line_number, line)) + .with_advice( + "The header must contain:\n\ + - type: \"@peoplesgrocers/json-archive\"\n\ + - version: 1\n\ + - created: an ISO-8601 timestamp\n\ + - initial: the initial state object" + .to_string(), + ) + })?; + + if header.version != 1 { + return Err(Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + format!( + "I found version {}, but I only support version 1.", + header.version + ), + ) + .with_location(filename.to_string(), line_number) + .with_advice( + "This archive was created with a newer or older version of the format. \ + You may need to upgrade your tools or convert the archive." + .to_string(), + )); + } + + Ok(header) } pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diagnostic> { @@ -578,7 +488,7 @@ pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diag diag.with_advice( "JSON Pointer paths must start with '/' and use '/' to separate segments.\n\ Special characters: use ~0 for ~ and ~1 for /" - .to_string() + .to_string(), ) })?; @@ -586,7 +496,7 @@ pub fn apply_add(state: &mut Value, path: &str, value: Value) -> Result<(), Diag diag.with_advice( "For add operations, the parent path must exist. \ For example, to add /a/b/c, the paths /a and /a/b must already exist." - .to_string() + .to_string(), ) }) } @@ -613,20 +523,18 @@ pub fn apply_move( let array = pointer.get_mut(state)?; if !array.is_array() { - return Err( - Diagnostic::fatal( - DiagnosticCode::MoveOnNonArray, - format!( - "I can't apply move operations to '{}' because it's not an array.", - path - ), - ) - .with_advice( - "Move operations can only reorder elements within an array. \ - The path must point to an array value." - .to_string(), + return Err(Diagnostic::fatal( + DiagnosticCode::MoveOnNonArray, + format!( + "I can't apply move operations to '{}' because it's not an array.", + path ), - ); + ) + .with_advice( + "Move operations can only reorder elements within an array. \ + The path must point to an array value." 
+ .to_string(), + )); } let arr = array.as_array_mut().unwrap(); @@ -659,7 +567,11 @@ pub fn apply_move( // Apply moves now that we know they're all valid for (from_idx, to_idx) in moves { let element = arr.remove(from_idx); - let insert_idx = if to_idx > from_idx { to_idx - 1 } else { to_idx }; + let insert_idx = if to_idx > from_idx { + to_idx - 1 + } else { + to_idx + }; arr.insert(insert_idx, element); } @@ -670,91 +582,114 @@ pub fn apply_move( mod tests { use super::*; use serde_json::json; - use std::io::Write; + use std::fs::File; + use std::io::{BufReader, Write}; use tempfile::NamedTempFile; #[test] - fn test_read_valid_archive() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_read_valid_archive() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), Some("test".to_string())); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 1]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert_eq!(result.final_state, json!({"count": 1})); assert_eq!(result.observation_count, 1); assert!(!result.diagnostics.has_fatal()); - - Ok(()) } #[test] - fn test_empty_file() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; + fn test_empty_file() { + let temp_file = NamedTempFile::new().unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ); - assert!(result.diagnostics.has_fatal()); - assert_eq!(result.diagnostics.len(), 1); - - Ok(()) + assert!(result.is_err()); } #[test] - fn test_non_existent_observation_id() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_non_existent_observation_id() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert!(result.diagnostics.has_fatal()); - - Ok(()) } #[test] - fn test_append_mode_ignores_observation_id() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_append_mode_ignores_observation_id() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = 
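Note (not part of the diff): a small standalone trace of the insert-index arithmetic reformatted just above, assuming a single move of the element at index 0 to declared index 2. It only restates what the reformatted code computes, not any broader move semantics.

    fn main() {
        let mut arr = vec!["a", "b", "c"];
        let (from_idx, to_idx) = (0usize, 2usize);

        let element = arr.remove(from_idx);
        // The remove shifted every later element one slot left, so a destination
        // greater than the source is decremented before inserting.
        let insert_idx = if to_idx > from_idx { to_idx - 1 } else { to_idx };
        arr.insert(insert_idx, element);

        assert_eq!(arr, vec!["b", "a", "c"]);
    }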
Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-999"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::AppendSeek)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::AppendSeek, + ) + .unwrap(); assert!(!result.diagnostics.has_fatal()); assert_eq!(result.final_state, json!({"count": 1})); - - Ok(()) } #[test] - fn test_change_count_mismatch() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_change_count_mismatch() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 0}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 2]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); let warnings: Vec<_> = result .diagnostics @@ -764,28 +699,31 @@ mod tests { .collect(); assert_eq!(warnings.len(), 1); - - Ok(()) } #[test] - fn test_simple_change() -> Result<(), Box> { - let mut temp_file = NamedTempFile::new()?; + fn test_simple_change() { + let mut temp_file = NamedTempFile::new().unwrap(); let header = Header::new(json!({"count": 5}), None); - writeln!(temp_file, "{}", serde_json::to_string(&header)?)?; + writeln!(temp_file, "{}", serde_json::to_string(&header).unwrap()).unwrap(); writeln!( temp_file, r#"["observe", "obs-1", "2025-01-01T00:00:00Z", 1]"# - )?; - writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#)?; + ) + .unwrap(); + writeln!(temp_file, r#"["change", "/count", 1, "obs-1"]"#).unwrap(); - let reader = ArchiveReader::new(temp_file.path(), ReadMode::FullValidation)?; - let result = reader.read(temp_file.path())?; + let file = File::open(temp_file.path()).unwrap(); + let reader = BufReader::new(file); + let result = read_archive( + reader, + &temp_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); assert!(!result.diagnostics.has_fatal()); assert_eq!(result.final_state, json!({"count": 1})); - - Ok(()) } } diff --git a/src/archive_writer.rs b/src/archive_writer.rs index 531809b..63b807d 100644 --- a/src/archive_writer.rs +++ b/src/archive_writer.rs @@ -19,112 +19,51 @@ // marxism@peoplesgrocers.com // -use chrono::{Utc, DateTime}; +use chrono::{DateTime, Utc}; use serde_json::Value; -use std::fs::{File, OpenOptions}; -use std::io::{BufWriter, Write, Read, Seek, SeekFrom}; +use std::io::Write; use std::path::{Path, PathBuf}; use uuid::Uuid; -use crate::atomic_file::{atomic_replace_file, generate_temp_filename}; use crate::diagnostics::{Diagnostic, DiagnosticCode}; use crate::diff; use crate::events::{Event, Header, 
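Note (not part of the diff): the reader tests above now call the free function read_archive on a plain BufReader instead of constructing an ArchiveReader. A minimal usage sketch under assumptions follows; only names visible in this diff (read_archive, ReadMode, and the result's final_state, observation_count, and diagnostics) are taken from the source, while the exact reader bound and error type are assumed.

    use std::fs::File;
    use std::io::BufReader;
    use std::path::Path;

    use json_archive::{read_archive, ReadMode};

    fn print_final_state(path: &Path) {
        let file = File::open(path).expect("could not open archive");
        let reader = BufReader::new(file);

        // read_archive takes the buffered reader, a display name used in
        // diagnostics, and a validation mode, returning a ReadResult on success.
        match read_archive(reader, &path.display().to_string(), ReadMode::FullValidation) {
            Ok(result) => {
                if result.diagnostics.has_fatal() {
                    eprintln!("archive contains fatal diagnostics");
                }
                println!("observations: {}", result.observation_count);
                println!("{}", serde_json::to_string_pretty(&result.final_state).unwrap());
            }
            Err(_) => eprintln!("could not read archive"),
        }
    }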
Observation}; -use crate::archive_reader::{ArchiveReader, ReadMode}; -use crate::detection::{CompressionFormat, detect_compression_format}; pub struct ArchiveWriter { - writer: BufWriter, observation_count: usize, - snapshot_interval: Option, filename: String, } impl ArchiveWriter { - pub fn new>( - path: P, - snapshot_interval: Option, - ) -> Result> { - let filename = path.as_ref().display().to_string(); - let file = match File::create(&path) { - Ok(f) => f, - Err(e) => { - let diagnostic = Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create the output file: {}", e) - ) - .with_advice( - "Make sure you have write permission in this directory and that the path is valid." - .to_string() - ); - return Err(vec![diagnostic]); - } - }; - let writer = BufWriter::new(file); - - Ok(Self { - writer, - observation_count: 0, - snapshot_interval, - filename, - }) - } - - pub fn new_append>( - path: P, - snapshot_interval: Option, - current_observation_count: usize, - ) -> Result> { - let filename = path.as_ref().display().to_string(); - let file = match OpenOptions::new().append(true).open(&path) { - Ok(f) => f, - Err(e) => { - let diagnostic = Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e) - ) - .with_advice( - "Make sure the archive file exists and you have write permission." - .to_string() - ); - return Err(vec![diagnostic]); - } - }; - let writer = BufWriter::new(file); - - Ok(Self { - writer, - observation_count: current_observation_count, - snapshot_interval, - filename, - }) - } - - pub fn write_header(&mut self, header: &Header) -> Result<(), Vec> { + pub fn write_header(&self, writer: &mut impl Write, header: &Header) -> Result<(), Diagnostic> { let header_json = match serde_json::to_string(header) { Ok(json) => json, Err(e) => { - return Err(vec![Diagnostic::fatal( + return Err(Diagnostic::fatal( DiagnosticCode::InvalidEventJson, format!("I couldn't serialize the header to JSON: {}", e), ) - .with_location(self.filename.clone(), 1)]); + .with_location(self.filename.clone(), 1)); } }; - if let Err(e) = writeln!(self.writer, "{}", header_json) { - return Err(vec![Diagnostic::fatal( + if let Err(e) = writeln!(writer, "{}", header_json) { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), ) - .with_location(self.filename.clone(), 1)]); + .with_location(self.filename.clone(), 1)); } Ok(()) } - pub fn write_comment(&mut self, comment: &str) -> Result<(), Vec> { - if let Err(e) = writeln!(self.writer, "# {}", comment) { + pub fn write_comment( + &self, + writer: &mut impl Write, + comment: &str, + ) -> Result<(), Vec> { + if let Err(e) = writeln!(writer, "# {}", comment) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -133,7 +72,11 @@ impl ArchiveWriter { Ok(()) } - pub fn write_observation(&mut self, observation: Observation) -> Result<(), Vec> { + pub fn write_observation( + &mut self, + writer: &mut impl Write, + observation: Observation, + ) -> Result<(), Vec> { let events = observation.to_events(); for event in events { @@ -147,7 +90,7 @@ impl ArchiveWriter { } }; - if let Err(e) = writeln!(self.writer, "{}", event_json) { + if let Err(e) = writeln!(writer, "{}", event_json) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -159,7 +102,11 @@ impl ArchiveWriter { Ok(()) } - pub fn 
write_snapshot(&mut self, object: &Value) -> Result<(), Vec> { + pub fn write_snapshot( + &self, + writer: &mut impl Write, + object: &Value, + ) -> Result<(), Vec> { let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); let snapshot = Event::Snapshot { observation_id: snapshot_id, @@ -177,7 +124,7 @@ impl ArchiveWriter { } }; - if let Err(e) = writeln!(self.writer, "{}", event_json) { + if let Err(e) = writeln!(writer, "{}", event_json) { return Err(vec![Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't write to the output file: {}", e), @@ -186,94 +133,6 @@ impl ArchiveWriter { Ok(()) } - - pub fn should_write_snapshot(&self) -> bool { - if let Some(interval) = self.snapshot_interval { - self.observation_count > 0 && self.observation_count % interval == 0 - } else { - false - } - } - - pub fn finish(mut self) -> Result<(), Vec> { - if let Err(e) = self.writer.flush() { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush the output file: {}", e), - )]); - } - Ok(()) - } -} - -pub struct ArchiveBuilder { - initial_state: Option, - current_state: Value, - source: Option, - snapshot_interval: Option, -} - -impl ArchiveBuilder { - pub fn new() -> Self { - Self { - initial_state: None, - current_state: Value::Null, - source: None, - snapshot_interval: None, - } - } - - pub fn with_source(mut self, source: String) -> Self { - self.source = Some(source); - self - } - - pub fn with_snapshot_interval(mut self, interval: usize) -> Self { - self.snapshot_interval = Some(interval); - self - } - - pub fn add_state(&mut self, state: Value) -> Option { - if self.initial_state.is_none() { - self.initial_state = Some(state.clone()); - self.current_state = state; - return None; - } - - let observation_id = format!("obs-{}", Uuid::new_v4()); - let timestamp = Utc::now(); - - let diff_result: Vec = diff::diff(&self.current_state, &state, "", &observation_id); - self.current_state = state; - - let mut observation = Observation::new(observation_id, timestamp); - for event in diff_result { - observation.add_event(event); - } - - Some(observation) - } - - pub fn build>(self, output_path: P) -> Result<(), Vec> { - if self.initial_state.is_none() { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::MissingHeaderField, - "I can't build an archive without any initial state.".to_string(), - )]); - } - - let header = Header::new(self.initial_state.unwrap(), self.source); - - let mut writer = ArchiveWriter::new(output_path, self.snapshot_interval)?; - writer.write_header(&header)?; - writer.finish()?; - - Ok(()) - } - - pub fn get_initial_state(&self) -> Option<&Value> { - self.initial_state.as_ref() - } } /// Generate default output filename from input filename @@ -290,36 +149,20 @@ pub fn default_output_filename>(input_path: P) -> PathBuf { } } - // Add .json.archive extension if let Some(extension) = path.extension() { if extension == "json" { - // Replace .json with .json.archive output.set_extension("json.archive"); } else { - // Append .json.archive to whatever extension exists let new_extension = format!("{}.json.archive", extension.to_string_lossy()); output.set_extension(new_extension); } } else { - // No extension, just add .json.archive output.set_extension("json.archive"); } output } -/// Detect if a file is compressed by checking magic bytes -/// Uses the existing compression detection from reader.rs -fn is_compressed>(path: P) -> std::io::Result { - let path = path.as_ref(); - let mut file = File::open(path)?; - let mut magic_bytes = 
[0u8; 4]; - let bytes_read = file.read(&mut magic_bytes)?; - - let format = detect_compression_format(path, &magic_bytes[..bytes_read]); - Ok(format != CompressionFormat::None) -} - /// Get the file modification time as a DateTime fn get_file_mtime>(path: P) -> std::io::Result> { let metadata = std::fs::metadata(path)?; @@ -343,676 +186,268 @@ fn get_file_mtime>(path: P) -> std::io::Result> { /// # Returns /// /// Returns the number of observations written -fn write_observations_to_writer>( +pub fn write_observation>( writer: &mut W, - current_state: Value, - new_files: &[P], - mut observation_count: usize, + observation_count: &mut usize, snapshot_interval: Option, -) -> Result> { - let mut builder = ArchiveBuilder::new(); - builder.current_state = current_state.clone(); - builder.initial_state = Some(current_state); - - for file_path in new_files.iter() { - // Write comment - let comment = format!("# Processing file: {:?}\n", file_path.as_ref()); - if let Err(e) = writer.write_all(comment.as_bytes()) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - - // Get file modification time - let file_mtime = match get_file_mtime(file_path) { - Ok(mtime) => mtime, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't get the file modification time: {}", e), - )]); - } - }; - - // Read and parse new state - let content = match std::fs::read_to_string(file_path) { - Ok(content) => content, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the input file: {}", e), - )]); - } - }; - - let state: Value = match serde_json::from_str(&content) { - Ok(state) => state, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse the input file as JSON: {}", e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())]); - } - }; - - // Generate and write observation - if let Some(mut observation) = builder.add_state(state.clone()) { - // Override the timestamp with the file modification time - observation.timestamp = file_mtime; - observation_count += 1; - - // Write observation events - for event in observation.to_events() { - let event_json = match serde_json::to_string(&event) { - Ok(json) => json, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize an event to JSON: {}", e), - )]); - } - }; - - if let Err(e) = writeln!(writer, "{}", event_json) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - } - - // Check if we should write a snapshot - if let Some(interval) = snapshot_interval { - if observation_count > 0 && observation_count % interval == 0 { - let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); - let snapshot = Event::Snapshot { - observation_id: snapshot_id, - timestamp: file_mtime, - object: state.clone(), - }; - - let snapshot_json = match serde_json::to_string(&snapshot) { - Ok(json) => json, - Err(e) => { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't serialize the snapshot to JSON: {}", e), - )]); - } - }; - - if let Err(e) = writeln!(writer, "{}", snapshot_json) { - return Err(vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write to the output: {}", e), - )]); - } - } - } - } - } - - 
Ok(observation_count) -} - -pub fn create_archive_from_files>( - input_files: &[P], - output_path: P, + current_state: &Value, + filename: &P, source: Option, - snapshot_interval: Option, -) -> Result<(), Vec> { - let mut builder = ArchiveBuilder::new(); - if let Some(source) = source { - builder = builder.with_source(source); - } - if let Some(interval) = snapshot_interval { - builder = builder.with_snapshot_interval(interval); - } +) -> Result { + // Get file modification time + let file_mtime = match get_file_mtime(filename) { + Ok(mtime) => mtime, + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't get the file modification time: {}", e), + )); + } + }; - let first_content = std::fs::read_to_string(&input_files[0]).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the first input file: {}", e), - )] - })?; - - let first_state: Value = serde_json::from_str(&first_content).map_err(|e| { - vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - format!("I couldn't parse the first input file as JSON: {}", e), - ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - let _ = builder.add_state(first_state.clone()); - - let header = Header::new(first_state, builder.source.clone()); - let mut writer = ArchiveWriter::new(&output_path, builder.snapshot_interval)?; - writer.write_header(&header)?; - - for file_path in input_files[1..].iter() { - writer.write_comment(&format!("Processing file: {:?}", file_path.as_ref()))?; - - let content = std::fs::read_to_string(file_path).map_err(|e| { - vec![Diagnostic::fatal( + let content = match std::fs::read_to_string(filename) { + Ok(content) => content, + Err(e) => { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, format!("I couldn't read the input file: {}", e), - )] - })?; + )); + } + }; - let state: Value = serde_json::from_str(&content).map_err(|e| { - vec![Diagnostic::fatal( + let state: Value = match serde_json::from_str(&content) { + Ok(state) => state, + Err(e) => { + return Err(Diagnostic::fatal( DiagnosticCode::InvalidEventJson, format!("I couldn't parse the input file as JSON: {}", e), ) - .with_advice("Make sure the file contains valid JSON.".to_string())] - })?; - - if let Some(observation) = builder.add_state(state.clone()) { - writer.write_observation(observation)?; - - if writer.should_write_snapshot() { - writer.write_snapshot(&state)?; - } - } - } - - writer.finish()?; - Ok(()) -} - -/// This reads the entire compressed archive, writes a new compressed -/// with all old events plus new observations to a temporary file, then -/// two phase commit style replace the original file. 
-fn append_to_compressed_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - _source: Option, - snapshot_interval: Option, -) -> Vec { - let archive_path = archive_path.as_ref(); - let output_path = output_path.as_ref(); - - // Step 1: Detect compression format and decompress entire file into memory - let mut file = match File::open(archive_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the compressed archive: {}", e), - )]; + .with_advice("Make sure the file contains valid JSON.".to_string())); } }; - let mut magic_bytes = [0u8; 4]; - let bytes_read = match file.read(&mut magic_bytes) { - Ok(n) => n, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - }; - let compression = detect_compression_format(archive_path, &magic_bytes[..bytes_read]); - file.seek(SeekFrom::Start(0)).unwrap(); - - let decompressed_bytes = { - - #[cfg(feature = "compression")] - { - use flate2::read::{GzDecoder, ZlibDecoder}; - use std::io::Read; - - let mut decompressed = Vec::new(); - - match compression { - CompressionFormat::Gzip => { - let mut decoder = GzDecoder::new(file); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress gzip archive: {}", e), - )]; - } - } - CompressionFormat::Zlib => { - let mut decoder = ZlibDecoder::new(file); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress zlib archive: {}", e), - )]; - } - } - CompressionFormat::Zstd => { - let mut decoder = match zstd::stream::read::Decoder::new(file) { - Ok(d) => d, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd decoder: {}", e), - )]; - } - }; - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress zstd archive: {}", e), - )]; - } - } - CompressionFormat::Brotli => { - let mut decoder = brotli::Decompressor::new(file, 4096); - if let Err(e) = decoder.read_to_end(&mut decompressed) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't decompress brotli archive: {}", e), - )]; - } - } - _ => { - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("Unsupported compression format: {:?}", compression), - )]; - } - } - - decompressed - } - - #[cfg(not(feature = "compression"))] - { - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "This build doesn't support compressed archives.".to_string(), - )]; - } - }; - - // Step 2 & 3: Use AppendSeek mode to parse minimally - // The reader will seek backward through the buffer to find snapshot - let reader = match ArchiveReader::new(archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create archive reader: {}", e), - )]; - } - }; - - let read_result = match reader.read(archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the compressed archive: {}", e), - )]; - } - }; - - // Check for fatal diagnostics - if read_result.diagnostics.has_fatal() { - let mut 
diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive.".to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // Step 4: Write to temp file with compression - let temp_path = generate_temp_filename(output_path); - - #[cfg(feature = "compression")] - { - use flate2::write::{GzEncoder, ZlibEncoder}; - use flate2::Compression; - - // Create temp file with same compression format as original - let temp_file = match File::create(&temp_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create temp file: {}", e), - )]; - } + if *observation_count == 0 { + // TODO: See if we can get rid of this clone on the Value + let header = Header::new(state.clone(), source); + let aw = ArchiveWriter { + observation_count: *observation_count, + filename: filename.as_ref().display().to_string(), }; + aw.write_header(writer, &header)?; + *observation_count += 1; + } else { + let observation_id = format!("obs-{}", Uuid::new_v4()); - // Helper macro to reduce code duplication - macro_rules! write_compressed { - ($encoder:expr) => {{ - // Write all old decompressed bytes - if let Err(e) = $encoder.write_all(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't write old archive data: {}", e), - )]; - } + let diff_result: Vec = diff::diff(current_state, &state, "", &observation_id); - // Write new observations using core logic - match write_observations_to_writer( - &mut $encoder, - read_result.final_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => {} - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - } - - // Finish compression - if let Err(e) = $encoder.finish() { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't finish compression: {}", e), - )]; - } - }}; + let mut observation = Observation::new(observation_id, file_mtime); + for event in diff_result { + observation.add_event(event); } - match compression { - CompressionFormat::Gzip => { - let mut encoder = GzEncoder::new(temp_file, Compression::default()); - write_compressed!(encoder); + *observation_count += 1; + + // Write observation events + for event in observation.to_events() { + let event_json = match serde_json::to_string(&event) { + Ok(json) => json, + Err(e) => { + return Err(Diagnostic::fatal( + DiagnosticCode::InvalidEventJson, + format!("I couldn't serialize an event to JSON: {}", e), + )); + } + }; + + if let Err(e) = writeln!(writer, "{}", event_json) { + return Err(Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't write to the output: {}", e), + )); } - CompressionFormat::Zlib => { - let mut encoder = ZlibEncoder::new(temp_file, Compression::default()); - write_compressed!(encoder); - } - CompressionFormat::Zstd => { - let mut encoder = match zstd::stream::write::Encoder::new(temp_file, 0) { - Ok(e) => e, + } + + // Check if we should write a snapshot + if let Some(interval) = snapshot_interval { + if *observation_count > 0 && *observation_count % interval == 0 { + let snapshot_id = format!("snapshot-{}", Uuid::new_v4()); + let snapshot = Event::Snapshot { + observation_id: snapshot_id, + timestamp: file_mtime, + object: 
state.clone(), + }; + + let snapshot_json = match serde_json::to_string(&snapshot) { + Ok(json) => json, Err(e) => { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't create zstd encoder: {}", e), - )]; + return Err(Diagnostic::fatal( + DiagnosticCode::InvalidEventJson, + format!("I couldn't serialize the snapshot to JSON: {}", e), + )); } }; - write_compressed!(encoder); - } - CompressionFormat::Brotli => { - // Brotli uses a different API - no finish() method - let mut encoder = brotli::CompressorWriter::new(temp_file, 4096, 11, 22); - - // Write all old decompressed bytes - if let Err(e) = encoder.write_all(&decompressed_bytes) { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( + + if let Err(e) = writeln!(writer, "{}", snapshot_json) { + return Err(Diagnostic::fatal( DiagnosticCode::PathNotFound, - format!("I couldn't write old archive data: {}", e), - )]; + format!("I couldn't write to the output: {}", e), + )); } - - // Write new observations using core logic - match write_observations_to_writer( - &mut encoder, - read_result.final_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => {} - Err(diagnostics) => { - let _ = std::fs::remove_file(&temp_path); - return diagnostics; - } - } - - // Flush the encoder (brotli auto-flushes on drop, but we flush explicitly) - if let Err(e) = encoder.flush() { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't flush brotli compression: {}", e), - )]; - } - // Encoder will be dropped here, completing the compression - } - _ => { - let _ = std::fs::remove_file(&temp_path); - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - format!("Unsupported compression format for writing: {:?}", compression), - )]; } } } - #[cfg(not(feature = "compression"))] - { - let _ = temp_path; - return vec![Diagnostic::fatal( - DiagnosticCode::UnsupportedVersion, - "This build doesn't support compressed archives.".to_string(), - )]; - } - - // Step 5: Atomic replace - match atomic_replace_file(output_path, &temp_path) { - Ok(()) => Vec::new(), - Err(diagnostics) => diagnostics, - } -} - -pub fn append_to_archive, Q: AsRef>( - archive_path: P, - new_files: &[Q], - output_path: P, - source: Option, - snapshot_interval: Option, -) -> Vec { - // Check if the archive is compressed - let is_archive_compressed = match is_compressed(&archive_path) { - Ok(compressed) => compressed, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't check if the archive is compressed: {}", e), - )]; - } - }; - - // If compressed, use the full rewrite strategy - if is_archive_compressed { - return append_to_compressed_archive( - &archive_path, - new_files, - &output_path, - source, - snapshot_interval, - ); - } - - // For uncompressed archives, use the direct append strategy (existing code) - // Read the existing archive to get the final state - let reader = match ArchiveReader::new(&archive_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive for reading: {}", e), - )]; - } - }; - - let read_result = match reader.read(&archive_path) { - Ok(result) => result, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive: {}", e), - )]; - } - }; 
- - // Check for fatal diagnostics in the archive - if read_result.diagnostics.has_fatal() { - let mut diagnostics = vec![Diagnostic::fatal( - DiagnosticCode::InvalidEventJson, - "The existing archive contains fatal errors. Cannot append to a corrupt archive.".to_string(), - )]; - diagnostics.extend(read_result.diagnostics.into_diagnostics()); - return diagnostics; - } - - // If output path is different from archive path, copy the archive first - if archive_path.as_ref() != output_path.as_ref() { - if let Err(e) = std::fs::copy(&archive_path, &output_path) { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't copy the archive to the output location: {}", e), - )]; - } - } - - // Open file in append mode - let mut file = match OpenOptions::new().append(true).open(&output_path) { - Ok(f) => f, - Err(e) => { - return vec![Diagnostic::fatal( - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file for appending: {}", e), - ) - .with_advice( - "Make sure the archive file exists and you have write permission.".to_string() - )]; - } - }; - - // Use core writing logic - let current_state = read_result.final_state; - match write_observations_to_writer( - &mut file, - current_state, - new_files, - read_result.observation_count, - snapshot_interval, - ) { - Ok(_) => Vec::new(), - Err(diagnostics) => diagnostics, - } + Ok(state) } #[cfg(test)] mod tests { use super::*; use serde_json::json; - use std::io::Write; + use std::fs::File; + use std::io::{BufWriter, Write}; use tempfile::NamedTempFile; + /// Helper to create a temp file with JSON content + fn create_json_file(content: &Value) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", serde_json::to_string(content).unwrap()).unwrap(); + file + } + #[test] - fn test_archive_writer_header() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; - let header = Header::new(json!({"test": "value"}), Some("test-source".to_string())); + fn test_single_file_creates_header_only() { + // When we have a single input file, the archive should contain just the header + // with that file's contents as the initial state + let input = create_json_file(&json!({"test": "value"})); + let output = NamedTempFile::new().unwrap(); + + let input_files = vec![input.path().to_path_buf()]; { - let mut writer = ArchiveWriter::new(temp_file.path(), None) - .map_err(|_| "Failed to create writer")?; - writer - .write_header(&header) - .map_err(|_| "Failed to write header")?; - writer.finish().map_err(|_| "Failed to finish")?; + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + file_path, + Some("test-source".to_string()), + ) + .unwrap(); + } + writer.flush().unwrap(); } - let content = std::fs::read_to_string(temp_file.path())?; - let lines: Vec<&str> = content.lines().collect(); + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); assert_eq!(lines.len(), 1); - let parsed_header: Header = serde_json::from_str(lines[0])?; + let parsed_header: Header = serde_json::from_str(lines[0]).unwrap(); assert_eq!(parsed_header.file_type, "@peoplesgrocers/json-archive"); assert_eq!(parsed_header.version, 1); assert_eq!(parsed_header.initial, 
json!({"test": "value"})); - - Ok(()) } #[test] - fn test_archive_builder() -> Result<(), Box> { - let mut builder = ArchiveBuilder::new(); + fn test_two_files_creates_header_and_observation() { + // When we have two input files, the first becomes the header's initial state + // and the second generates change events + let file1 = create_json_file(&json!({"count": 0, "name": "test"})); + let file2 = create_json_file(&json!({"count": 1, "name": "test"})); + let output = NamedTempFile::new().unwrap(); - // First state becomes initial - let result = builder.add_state(json!({"count": 0})); - assert!(result.is_none()); + let input_files = vec![file1.path().to_path_buf(), file2.path().to_path_buf()]; - // Second state generates observation - let observation = builder - .add_state(json!({"count": 1})) - .expect("Should generate observation"); - assert!(!observation.events.is_empty()); + { + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; - Ok(()) - } + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + file_path, + Some("test-source".to_string()), + ) + .unwrap(); + } + writer.flush().unwrap(); + } - #[test] - fn test_create_archive_from_files() -> Result<(), Box> { - // Create temporary input files - let mut file1 = NamedTempFile::new()?; - let mut file2 = NamedTempFile::new()?; - let output_file = NamedTempFile::new()?; + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); - writeln!(file1, r#"{{"count": 0, "name": "test"}}"#)?; - writeln!(file2, r#"{{"count": 1, "name": "test"}}"#)?; - - let input_files = vec![file1.path(), file2.path()]; - - create_archive_from_files( - &input_files, - output_file.path(), - Some("test-source".to_string()), - None, - ) - .map_err(|_| "Failed to create archive")?; - - let content = std::fs::read_to_string(output_file.path())?; - let lines: Vec<&str> = content.lines().collect(); - - assert!(lines.len() >= 2); // At least header + comment + observe + change events + // Should have header + observe event + at least one change event + assert!(lines.len() >= 2); // First line should be header - let header: Header = serde_json::from_str(lines[0])?; + let header: Header = serde_json::from_str(lines[0]).unwrap(); assert_eq!(header.file_type, "@peoplesgrocers/json-archive"); assert_eq!(header.version, 1); assert_eq!(header.initial, json!({"count": 0, "name": "test"})); - - Ok(()) } #[test] - fn test_snapshot_interval() -> Result<(), Box> { - let temp_file = NamedTempFile::new()?; - let mut writer = - ArchiveWriter::new(temp_file.path(), Some(2)).map_err(|_| "Failed to create writer")?; + fn test_snapshot_written_at_interval() { + // When snapshot_interval is set, a snapshot should be written every N observations + let file1 = create_json_file(&json!({"count": 0})); + let file2 = create_json_file(&json!({"count": 1})); + let file3 = create_json_file(&json!({"count": 2})); + let output = NamedTempFile::new().unwrap(); - assert!(!writer.should_write_snapshot()); // No observations yet + let input_files = vec![ + file1.path().to_path_buf(), + file2.path().to_path_buf(), + file3.path().to_path_buf(), + ]; - let obs1 = Observation::new("obs-1".to_string(), Utc::now()); - writer - .write_observation(obs1) - .map_err(|_| "Failed to write observation")?; - 
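Note (not part of the diff): with ArchiveBuilder and the ArchiveWriter constructors removed, both the create and append flows drive the free function write_observation, which writes the header on its first call and diffs against the previous state afterwards. A minimal sketch of that loop, mirroring the tests above, follows; the crate-root import path for write_observation is an assumption (the tests reach it via use super::*), and the snapshot interval shown is hypothetical.

    use std::fs::File;
    use std::io::{BufWriter, Write};
    use std::path::{Path, PathBuf};

    use serde_json::Value;
    // Assumed re-export; inside src/archive_writer.rs the tests use `super::*`.
    use json_archive::write_observation;

    fn build_archive(inputs: &[PathBuf], output: &Path) {
        let file = File::create(output).expect("could not create output file");
        let mut writer = BufWriter::new(file);

        let mut current_state = Value::Null;
        let mut observation_count: usize = 0;

        for input in inputs {
            // The first call writes the header from the file's contents; later
            // calls diff the new contents against current_state and append events.
            current_state = write_observation(
                &mut writer,
                &mut observation_count,
                Some(10), // hypothetical snapshot interval
                &current_state,
                input,
                Some("example-source".to_string()),
            )
            .expect("could not write observation");
        }

        writer.flush().expect("could not flush output");
    }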
assert!(!writer.should_write_snapshot()); // 1 observation, interval is 2 + { + let file = File::create(output.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + let snapshot_interval = Some(2); - let obs2 = Observation::new("obs-2".to_string(), Utc::now()); - writer - .write_observation(obs2) - .map_err(|_| "Failed to write observation")?; - assert!(writer.should_write_snapshot()); // 2 observations, should snapshot + for file_path in &input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + snapshot_interval, + ¤t_state, + file_path, + None, + ) + .unwrap(); + } + writer.flush().unwrap(); + } - Ok(()) + let content = std::fs::read_to_string(output.path()).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); + + // Look for a snapshot event + let has_snapshot = lines.iter().any(|line| { + if let Ok(event) = serde_json::from_str::(line) { + matches!(event, Event::Snapshot { .. }) + } else { + false + } + }); + + assert!( + has_snapshot, + "Expected a snapshot event after 2 observations" + ); } #[test] diff --git a/src/atomic_file.rs b/src/atomic_file.rs index 8b060bc..ffee8ae 100644 --- a/src/atomic_file.rs +++ b/src/atomic_file.rs @@ -110,7 +110,10 @@ pub fn generate_temp_filename>(path: P) -> PathBuf { /// /// Returns diagnostics if any step of the operation fails. The function /// attempts automatic recovery by restoring the backup if the replacement fails. -pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Result<(), Vec> { +pub fn atomic_replace_file>( + original_path: P, + temp_path: P, +) -> Result<(), Vec> { let original = original_path.as_ref(); let temp = temp_path.as_ref(); @@ -119,12 +122,13 @@ pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Re if let Some(filename_str) = filename.to_str() { // Extract random suffix from temp filename if it follows our pattern let temp_filename = temp.file_name().and_then(|f| f.to_str()).unwrap_or(""); - let random_suffix = if temp_filename.starts_with('.') && temp_filename.contains(filename_str) { - // Extract suffix after the original filename - temp_filename.rsplit('.').next().unwrap_or("backup") - } else { - "backup" - }; + let random_suffix = + if temp_filename.starts_with('.') && temp_filename.contains(filename_str) { + // Extract suffix after the original filename + temp_filename.rsplit('.').next().unwrap_or("backup") + } else { + "backup" + }; let backup_filename = format!(".{}.{}.old", filename_str, random_suffix); if let Some(parent) = original.parent() { @@ -148,7 +152,7 @@ pub fn atomic_replace_file>(original_path: P, temp_path: P) -> Re ) .with_advice( "Make sure you have write permission in this directory and sufficient disk space." - .to_string() + .to_string(), )]); } diff --git a/src/bin/pointer_errors_demo.rs b/src/bin/pointer_errors_demo.rs index 9f95c45..dc2c8cd 100644 --- a/src/bin/pointer_errors_demo.rs +++ b/src/bin/pointer_errors_demo.rs @@ -15,7 +15,8 @@ fn print_example(pointer_str: &str, value: &mut serde_json::Value) { } fn main() { - print!(r#" + print!( + r#" # JSON Pointer Diagnostics @@ -52,7 +53,8 @@ or submit a pull request. Key doesn't exist in the object. Shows available keys and suggests typos. -"#); +"# + ); print_example( "/user/emial", @@ -65,13 +67,15 @@ Key doesn't exist in the object. Shows available keys and suggests typos. 
}), ); - print!(r#" + print!( + r#" ## Type Mismatch Tried to index into a value that doesn't support it (e.g., `/domain` on a string, `/0` on a number). Shows the actual type. -"#); +"# + ); print_example( "/users/0/email/domain", @@ -82,12 +86,14 @@ Tried to index into a value that doesn't support it (e.g., `/domain` on a string }), ); - print!(r#" + print!( + r#" ## Array Index Out of Bounds Index past the end of the array. Shows the array length. -"#); +"# + ); print_example( "/items/5", @@ -96,12 +102,14 @@ Index past the end of the array. Shows the array length. }), ); - print!(r#" + print!( + r#" ## Array Index If you think you have an object but you're actually indexing into an array, you'll see this error. -"#); +"# + ); print_example( "/items/foo", @@ -110,13 +118,15 @@ If you think you have an object but you're actually indexing into an array, you' }), ); - print!(r#" + print!( + r#" ## Deep Path Failures For long paths, the underline shows which segment failed. The full path remains visible so you can see what you were trying to reach. -"#); +"# + ); print_example( "/data/users/0/profile/settings/theme", diff --git a/src/cmd/info.rs b/src/cmd/info.rs index 15b8502..093d189 100644 --- a/src/cmd/info.rs +++ b/src/cmd/info.rs @@ -21,7 +21,9 @@ use crate::flags; use chrono::{DateTime, Utc}; -use json_archive::{Diagnostic, DiagnosticCode, DiagnosticLevel, Event}; +use json_archive::archive_open::open_archive; +use json_archive::detection::CompressionFormat; +use json_archive::{read_events, Diagnostic, DiagnosticCode, DiagnosticLevel, Event}; use serde::Serialize; use std::path::Path; @@ -46,6 +48,7 @@ struct JsonObservation { #[derive(Serialize)] struct JsonInfoOutput { archive: String, + compression: String, created: String, file_size: u64, snapshot_count: usize, @@ -54,9 +57,9 @@ struct JsonInfoOutput { efficiency_percent: f64, } -pub fn run(flags: &flags::Info) -> Vec { +pub fn run(flags: &flags::Info) -> Result<(), Vec> { if !flags.file.exists() { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::PathNotFound, format!("I couldn't find the archive file: {}", flags.file.display()), @@ -65,12 +68,13 @@ pub fn run(flags: &flags::Info) -> Vec { "Make sure the file path is correct and the file exists. \ Check for typos in the filename." 
.to_string(), - )]; + )]); } - let (observations, snapshot_count) = match collect_observations(&flags.file) { - Ok((obs, count)) => (obs, count), - Err(diagnostics) => return diagnostics, + let (observations, snapshot_count, compression_format) = match collect_observations(&flags.file) + { + Ok((obs, count, format)) => (obs, count, format), + Err(diagnostics) => return Err(diagnostics), }; let file_size = match std::fs::metadata(&flags.file) { @@ -79,7 +83,10 @@ pub fn run(flags: &flags::Info) -> Vec { }; // Calculate total JSON size (sum of all observations + newline separators) - let total_json_size: u64 = observations.iter().map(|obs| obs.json_size as u64).sum::() + let total_json_size: u64 = observations + .iter() + .map(|obs| obs.json_size as u64) + .sum::() + (observations.len() as u64).saturating_sub(1); // Add newlines between observations let efficiency_percent = if total_json_size > 0 { @@ -96,6 +103,7 @@ pub fn run(flags: &flags::Info) -> Vec { if observations.is_empty() { let empty_output = JsonInfoOutput { archive: flags.file.display().to_string(), + compression: compression_format.to_string(), created: "".to_string(), file_size, snapshot_count, @@ -107,7 +115,7 @@ pub fn run(flags: &flags::Info) -> Vec { "{}", serde_json::to_string_pretty(&empty_output).unwrap_or_default() ); - return Vec::new(); + return Ok(()); } let json_observations: Vec = observations @@ -128,6 +136,7 @@ pub fn run(flags: &flags::Info) -> Vec { let json_output = JsonInfoOutput { archive: flags.file.display().to_string(), + compression: compression_format.to_string(), created: observations[0].created.to_rfc3339(), file_size, snapshot_count, @@ -143,10 +152,11 @@ pub fn run(flags: &flags::Info) -> Vec { } else { // Human-readable output mode println!("Archive: {}", flags.file.display()); + println!("Compression: {}", compression_format); if observations.is_empty() { println!("No observations found"); - return Vec::new(); + return Ok(()); } let first_timestamp = &observations[0].created; @@ -217,56 +227,26 @@ pub fn run(flags: &flags::Info) -> Vec { snapshot_text, comparison ); - println!( - "Data size: {}", - format_size(total_json_size) - ); + println!("Data size: {}", format_size(total_json_size)); // Add usage instructions println!(); println!("To get the JSON value at a specific observation:"); - println!(" json-archive state --index <#> {}", flags.file.display()); - println!( - " json-archive state --id {}", - flags.file.display() - ); - println!(); - println!("Examples:"); - println!( - " json-archive state --index 0 {} # Get initial state", - flags.file.display() - ); - println!( - " json-archive state --index 2 {} # Get state after observation 2", - flags.file.display() - ); + println!(" json-archive state --index <#> "); + println!(" json-archive state --id "); } - Vec::new() + Ok(()) } -fn collect_observations(file_path: &Path) -> Result<(Vec, usize), Vec> { - let reader = match json_archive::ArchiveReader::new(file_path, json_archive::ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file: {}", e), - )]); - } - }; +fn collect_observations( + file_path: &Path, +) -> Result<(Vec, usize, CompressionFormat), Vec> { + let opened = open_archive(file_path)?; + let compression_format = opened.format; - let (initial_state, mut event_iter) = match reader.events(file_path) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - 
DiagnosticCode::PathNotFound, - format!("I couldn't read the archive file: {}", e), - )]); - } - }; + let (initial_state, mut event_iter) = + read_events(opened.reader, &file_path.display().to_string())?; // Check for fatal diagnostics from initial parsing if event_iter.diagnostics.has_fatal() { @@ -295,7 +275,11 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize // Iterate through events while let Some(event) = event_iter.next() { match event { - Event::Observe { observation_id, timestamp, change_count } => { + Event::Observe { + observation_id, + timestamp, + change_count, + } => { observations.push(ObservationInfo { id: observation_id, timestamp, @@ -316,7 +300,9 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize } } } - Event::Change { path, new_value, .. } => { + Event::Change { + path, new_value, .. + } => { let _ = json_archive::apply_change(&mut current_state, &path, new_value); // Update the JSON size of the last observation @@ -368,10 +354,9 @@ fn collect_observations(file_path: &Path) -> Result<(Vec, usize } } - Ok((observations, snapshot_count)) + Ok((observations, snapshot_count, compression_format)) } - fn format_timestamp(dt: &DateTime) -> String { dt.format("%a %H:%M:%S %d-%b-%Y").to_string() } diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index e54432a..6ac4eb6 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -21,3 +21,4 @@ pub mod info; pub mod state; +pub mod write; diff --git a/src/cmd/state.rs b/src/cmd/state.rs index 92da1b0..5aebc6a 100644 --- a/src/cmd/state.rs +++ b/src/cmd/state.rs @@ -21,7 +21,11 @@ use crate::flags; use chrono::{DateTime, Utc}; -use json_archive::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, Diagnostic, DiagnosticCode, DiagnosticLevel, Event, ReadMode}; +use json_archive::archive_open::open_archive; +use json_archive::{ + apply_add, apply_change, apply_move, apply_remove, read_events, Diagnostic, DiagnosticCode, + DiagnosticLevel, Event, +}; use serde_json::Value; use std::path::Path; @@ -35,9 +39,9 @@ enum AccessMethod { Latest, } -pub fn run(flags: &flags::State) -> Vec { +pub fn run(flags: &flags::State) -> Result<(), Vec> { if !flags.file.exists() { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::PathNotFound, format!("I couldn't find the archive file: {}", flags.file.display()), @@ -46,34 +50,34 @@ pub fn run(flags: &flags::State) -> Vec { "Make sure the file path is correct and the file exists. \ Check for typos in the filename." 
.to_string(), - )]; + )]); } // Parse and validate flags - ensure only one access method is specified let access_method = match parse_access_method(flags) { Ok(method) => method, - Err(diagnostic) => return vec![diagnostic], + Err(diagnostic) => return Err(vec![diagnostic]), }; // Find and replay to the target observation let target_state = match find_and_replay_to_target(&flags.file, &access_method) { Ok(state) => state, - Err(diagnostics) => return diagnostics, + Err(diagnostics) => return Err(diagnostics), }; // Output the JSON state match serde_json::to_string_pretty(&target_state) { Ok(json) => println!("{}", json), Err(e) => { - return vec![Diagnostic::new( + return Err(vec![Diagnostic::new( DiagnosticLevel::Fatal, DiagnosticCode::InvalidEventJson, format!("I couldn't serialize the state to JSON: {}", e), - )]; + )]); } } - Vec::new() + Ok(()) } fn parse_access_method(flags: &flags::State) -> Result { @@ -151,27 +155,10 @@ fn find_and_replay_to_target( file_path: &Path, access_method: &AccessMethod, ) -> Result> { - let reader = match ArchiveReader::new(file_path, ReadMode::AppendSeek) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't open the archive file: {}", e), - )]); - } - }; + let opened = open_archive(file_path)?; - let (initial_state, mut event_iter) = match reader.events(file_path) { - Ok(r) => r, - Err(e) => { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't read the archive file: {}", e), - )]); - } - }; + let (initial_state, mut event_iter) = + read_events(opened.reader, &file_path.display().to_string())?; // Check for fatal diagnostics from initial parsing if event_iter.diagnostics.has_fatal() { @@ -193,7 +180,11 @@ fn find_and_replay_to_target( // Process events and track state at each observation while let Some(event) = event_iter.next() { match event { - Event::Observe { observation_id, timestamp, change_count: _ } => { + Event::Observe { + observation_id, + timestamp, + change_count: _, + } => { observations.push(ObservationWithEvents { id: observation_id, timestamp, @@ -210,7 +201,9 @@ fn find_and_replay_to_target( } } } - Event::Change { path, new_value, .. } => { + Event::Change { + path, new_value, .. 
+ } => { let _ = apply_change(&mut current_state, &path, new_value); // Update the final state of the last observation diff --git a/src/cmd/write.rs b/src/cmd/write.rs new file mode 100644 index 0000000..d513c18 --- /dev/null +++ b/src/cmd/write.rs @@ -0,0 +1,316 @@ +use crate::flags; +use json_archive::archive_open::{check_compression_support, open_archive}; +use json_archive::archive_reader::{read_archive, ReadMode}; +use json_archive::archive_writer::{default_output_filename, write_observation}; +use json_archive::atomic_file::atomic_replace_file; +use json_archive::compression_writer::CompressionWriter; +use json_archive::detection::CompressionFormat; +use json_archive::write_strategy::{determine_strategy, WriteStrategy}; +use json_archive::{is_json_archive, Diagnostic, DiagnosticCode, DiagnosticLevel}; + +use serde_json::Value; +use std::fs::{File, OpenOptions}; +use std::io::{BufWriter, Write}; +use std::path::{Path, PathBuf}; + +pub fn run(flags: &flags::Write) -> Result<(), Vec> { + let (input_files, strategy) = parse_flags(flags)?; + + assert!(!input_files.is_empty()); + + match strategy { + WriteStrategy::Create { + output: (dest, dest_fmt), + } => { + check_compression_support(dest_fmt, &dest, "write")?; + + println!("Creating new archive: {}", dest.display()); + println!("Input files: {:?}", input_files); + + // Create the writer - on error, no file cleanup needed since create failed + let mut writer = CompressionWriter::create(&dest, dest_fmt)?; + + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + for file in input_files { + // TODO: On write error, we need to clean up the partially written file ourselves + current_state = write_observation( + &mut writer, + &mut observation_count, + flags.snapshot_interval, + ¤t_state, + &file, + flags.source.clone(), + )?; + } + + // Finalize compression and flush buffers. + // Note: finish() does not clean up the file on error - caller must + // remove the file themselves if this fails. + if let Err(diagnostics) = writer.finish() { + let _ = std::fs::remove_file(&dest); + return Err(diagnostics); + } + + println!("Archive created successfully: {}", dest.display()); + Ok(()) + } + WriteStrategy::Append { path } => { + let opened = open_archive(&path)?; + let read_result = read_archive( + opened.reader, + &path.display().to_string(), + ReadMode::AppendSeek, + )?; + + if read_result.diagnostics.has_fatal() { + return Err(read_result.diagnostics.into_diagnostics()); + } + + let mut current_state = read_result.final_state; + // observation_count starts at existing count + 1 (header counts as first observation) + let mut observation_count = read_result.observation_count + 1; + + // Note, we are reopening the same file for appending. 
So getting a new file descriptor + let mut writer = BufWriter::new(open_for_appending(&path)?); + + for filename in input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + flags.snapshot_interval, + ¤t_state, + &filename, + flags.source.clone(), + )?; + } + + writer.flush().map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't flush the output: {}", e), + ) + })?; + + Ok(()) + } + WriteStrategy::CopyOnWrite { + input: (src, fmt_src), + output: (dest, fmt_dest), + } => { + assert!(src != dest); + check_compression_support(fmt_src, &src, "read")?; + check_compression_support(fmt_dest, &dest, "write")?; + + copy_and_append( + &src, + &dest, + fmt_dest, + &input_files, + flags.snapshot_interval, + flags.source.clone(), + ) + } + WriteStrategy::AtomicSwap { + path, + compression: format, + temp_path, + } => { + assert!(path != temp_path); + check_compression_support(format, &path, "read")?; + + copy_and_append( + &path, + &temp_path, + format, + &input_files, + flags.snapshot_interval, + flags.source.clone(), + )?; + + atomic_replace_file(&path, &temp_path) + } + } +} + +fn open_for_appending(path: &Path) -> Result> { + let file: File = OpenOptions::new().append(true).open(&path).map_err(|e| { + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't open the archive file for appending: {}", e), + ) + .with_advice("Make sure the archive file exists and you have write permission.".to_string()) + })?; + Ok(file) +} + +/// Copy an archive from source to destination, then append new observations. +/// +/// This handles decompression of the source and compression of the destination +/// transparently. On error, the destination file is removed. +fn copy_and_append( + src: &Path, + dest: &Path, + dest_fmt: CompressionFormat, + input_files: &[PathBuf], + snapshot_interval: Option, + source: Option, +) -> Result<(), Vec> { + assert!(src != dest); + let opened = open_archive(src)?; + let mut reader = opened.reader; + + // Create destination writer (handles compression) + let mut writer = CompressionWriter::create(dest, dest_fmt)?; + + // Copy all decompressed bytes to the new (possibly compressed) destination + std::io::copy(&mut reader, &mut writer).map_err(|e| { + let _ = std::fs::remove_file(dest); + Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!("I couldn't copy the archive contents: {}", e), + ) + })?; + + // Read the archive to get final state for appending + let opened = open_archive(src)?; + let read_result = read_archive( + opened.reader, + &src.display().to_string(), + ReadMode::AppendSeek, + )?; + + if read_result.diagnostics.has_fatal() { + let _ = std::fs::remove_file(dest); + return Err(read_result.diagnostics.into_diagnostics()); + } + + let mut current_state = read_result.final_state; + let mut observation_count = read_result.observation_count + 1; + + // Append new observations + for filename in input_files { + current_state = write_observation( + &mut writer, + &mut observation_count, + snapshot_interval, + ¤t_state, + filename, + source.clone(), + )?; + } + + // Finalize compression and flush buffers + if let Err(diagnostics) = writer.finish() { + let _ = std::fs::remove_file(dest); + return Err(diagnostics); + } + + Ok(()) +} + +/// Parse the CLI arguments to determine the destination archive and input files. +/// This consolidates all the inferring behavior in one place. 
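+///
+/// A minimal sketch of the inference step, assuming the `default_output_filename`
+/// and `determine_strategy` signatures used elsewhere in this file; the path
+/// below is purely illustrative:
+///
+/// ```ignore
+/// use std::path::PathBuf;
+/// use json_archive::archive_writer::default_output_filename;
+/// use json_archive::detection::CompressionFormat;
+/// use json_archive::write_strategy::determine_strategy;
+///
+/// // No -o flag and the first input is plain JSON, so the archive name is
+/// // derived from it (A.json -> A.json.archive, per the write_strategy table).
+/// let destination = default_output_filename(&PathBuf::from("A.json"));
+/// // With no existing source archive this resolves to WriteStrategy::Create,
+/// // taking its compression format from the destination extension.
+/// let strategy = determine_strategy(None, &destination, CompressionFormat::None);
+/// ```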
+fn parse_flags(flags: &flags::Write) -> Result<(Vec, WriteStrategy), Vec> { + let mut diagnostics = Vec::new(); + if flags.inputs.is_empty() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::MissingHeaderField, + "I need at least one JSON file to create an archive, but you didn't provide any." + .to_string(), + ) + .with_advice( + "Usage: json-archive [file2.json ...]\n\n\ + The first file will be used as the initial state, and subsequent files \ + will be compared to generate change events." + .to_string(), + ), + ); + return Err(diagnostics); + } + + // I figured it would be a helpful bit of automation on behalf of the human + // user for this tool to validate all input files exist + for file in flags.inputs.iter() { + if !file.exists() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::PathNotFound, + format!("I couldn't find the input file: {}", file.display()), + ) + .with_advice( + "Make sure the file path is correct and the file exists. \ + Check for typos in the filename." + .to_string(), + ), + ); + } + } + + let source_archive: Option = if Path::new(&flags.inputs[0]).exists() + && is_json_archive(&flags.inputs[0]).unwrap_or(false) + { + Some(flags.inputs[0].clone()) + } else { + None + }; + + // Determine the destination archive path + let destination = if let Some(output) = &flags.output { + // Explicitly specified output path + output.clone() + } else if source_archive.is_some() { + source_archive.clone().unwrap() + } else { + // Infer from first input + default_output_filename(&flags.inputs[0]) + }; + + // Filter out the destination from input files to avoid read-write conflicts + let input_files: Vec<_> = flags + .inputs + .iter() + .filter(|path| { + match ( + std::fs::canonicalize(path).ok(), + std::fs::canonicalize(&destination).ok(), + ) { + (Some(p), Some(d)) => p != d, + _ => true, // Include if canonicalization fails (file doesn't exist yet) + } + }) + .cloned() + .collect(); + + if input_files.is_empty() { + diagnostics.push( + Diagnostic::new( + DiagnosticLevel::Fatal, + DiagnosticCode::MissingHeaderField, + "No input files remain after filtering out the destination archive.".to_string() + ) + .with_advice( + "You specified the output path in the list of input files. This would cause a read-write conflict.\n\ + Either remove the output path from inputs, or use a different output path with -o." + .to_string() + ) + ); + return Err(diagnostics); + } + + if !diagnostics.is_empty() { + return Err(diagnostics); + } + + Ok(( + input_files, + determine_strategy( + source_archive.as_deref(), + &destination, + CompressionFormat::None, + ), + )) +} diff --git a/src/compression_writer.rs b/src/compression_writer.rs new file mode 100644 index 0000000..cfac557 --- /dev/null +++ b/src/compression_writer.rs @@ -0,0 +1,431 @@ +// json-archive is a tool for tracking JSON file changes over time +// Copyright (C) 2025 Peoples Grocers LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. 
+// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . +// +// To purchase a license under different terms contact admin@peoplesgrocers.com +// To request changes, report bugs, or give user feedback contact +// marxism@peoplesgrocers.com +// + +//! Unified writer abstraction for compressed and uncompressed output. +//! +//! This module provides `CompressionWriter`, an enum that wraps different +//! compression encoders behind a common interface implementing `std::io::Write`. +//! +//! The goal is to simplify write logic by allowing callers to write to any +//! compression format using the same API, with proper error handling that +//! produces user-friendly diagnostics. + +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::Path; + +use crate::detection::CompressionFormat; +use crate::diagnostics::{Diagnostic, DiagnosticCode}; + +/// A writer that handles optional compression transparently. +/// +/// Wraps different compression encoders behind a unified interface +/// that implements `Write` and provides a `finish()` method for cleanup. +/// +/// # Example +/// +/// ```ignore +/// use json_archive::compression_writer::CompressionWriter; +/// use json_archive::detection::CompressionFormat; +/// use std::io::Write; +/// +/// let mut writer = CompressionWriter::create(path, CompressionFormat::Gzip)?; +/// writeln!(writer, "some data")?; +/// writer.finish()?; +/// ``` +// Note: Cannot derive Debug because compression encoder types don't implement Debug +pub enum CompressionWriter { + /// Uncompressed output - uses BufWriter since File has no internal buffering + Plain(BufWriter), + /// Compression encoders write directly to File - they do their own internal buffering + #[cfg(feature = "compression")] + Gzip(flate2::write::GzEncoder), + #[cfg(feature = "compression")] + Zlib(flate2::write::ZlibEncoder), + #[cfg(feature = "compression")] + Zstd(zstd::stream::write::Encoder<'static, File>), + #[cfg(feature = "compression")] + Brotli(brotli::CompressorWriter), +} + +impl CompressionWriter { + /// Open a file for writing with the specified compression format. + /// + /// # Errors + /// + /// Returns a diagnostic explaining: + /// - What file we tried to create + /// - What compression format was requested + /// - Why it failed (permissions, disk full, unsupported format, etc.) + pub fn create(path: &Path, format: CompressionFormat) -> Result> { + let file = File::create(path).map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't create the output file '{}': {}", + path.display(), + describe_io_error(&e) + ), + ) + .with_advice(advice_for_create_error(&e, path))] + })?; + + match format { + // Plain needs BufWriter since File has no internal buffering + CompressionFormat::None => Ok(Self::Plain(BufWriter::new(file))), + + // Compression encoders do their own buffering, write directly to File + #[cfg(feature = "compression")] + CompressionFormat::Gzip => { + use flate2::write::GzEncoder; + use flate2::Compression; + Ok(Self::Gzip(GzEncoder::new(file, Compression::default()))) + } + + #[cfg(feature = "compression")] + CompressionFormat::Zlib => { + use flate2::write::ZlibEncoder; + use flate2::Compression; + Ok(Self::Zlib(ZlibEncoder::new(file, Compression::default()))) + } + + #[cfg(feature = "compression")] + CompressionFormat::Deflate => { + // Deflate is a raw compression algorithm, not a container format. 
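+ // (Raw deflate, per RFC 1951, carries no header or checksum of its own;
+ // gzip and zlib are both thin containers wrapped around a deflate stream.)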
+ // We can read deflate data, but when writing we need to pick a + // container (gzip or zlib) that provides headers and checksums. + Err(vec![Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + "I can't write raw deflate format because it's not a container format.".to_string(), + ) + .with_advice( + "Deflate is a compression algorithm, not a file format. When writing, \ + you need to choose a container format that wraps deflate data:\n\ + \n - Use .gz (gzip) for general-purpose compression\n \ + - Use .zlib for zlib-wrapped deflate\n\ + \nIf you're appending to an existing deflate file, consider converting \ + it to gzip first.".to_string() + )]) + } + + #[cfg(feature = "compression")] + CompressionFormat::Zstd => { + let encoder = zstd::stream::write::Encoder::new(file, 0).map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't initialize zstd compression for '{}': {}", + path.display(), + e + ), + )] + })?; + Ok(Self::Zstd(encoder)) + } + + #[cfg(feature = "compression")] + CompressionFormat::Brotli => { + // buffer_size=4096, quality=11 (max), lgwin=22 (default window) + Ok(Self::Brotli(brotli::CompressorWriter::new( + file, 4096, 11, 22, + ))) + } + + #[cfg(not(feature = "compression"))] + _ => Err(vec![Diagnostic::fatal( + DiagnosticCode::UnsupportedVersion, + format!( + "I can't write {} compressed files because this build doesn't include compression support.", + format_name(format) + ), + ) + .with_advice("Rebuild with: cargo build --features compression".to_string())]), + } + } + + /// Finish writing and flush all buffers. + /// + /// For compressed formats, this finalizes the compression stream. + /// Must be called before dropping to ensure all data is written. + /// + /// # Errors + /// + /// Returns a diagnostic if flushing or finalizing fails. + /// + /// **Important**: This method does not clean up the output file on error. 
+ /// If `finish()` fails, the caller is responsible for removing the + /// partially-written file themselves: + /// + /// ```ignore + /// if let Err(diagnostics) = writer.finish() { + /// let _ = std::fs::remove_file(&path); + /// return Err(diagnostics); + /// } + /// ``` + pub fn finish(self) -> Result<(), Vec> { + match self { + Self::Plain(mut w) => w.flush().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't flush the output file: {}", + describe_io_error(&e) + ), + )] + }), + + #[cfg(feature = "compression")] + Self::Gzip(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize gzip compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Zlib(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize zlib compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Zstd(encoder) => { + encoder.finish().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize zstd compression: {}", + describe_io_error(&e) + ), + )] + })?; + Ok(()) + } + + #[cfg(feature = "compression")] + Self::Brotli(mut encoder) => { + // Brotli uses a different API - no finish() method + // Flush the encoder (brotli auto-flushes on drop, but we flush explicitly) + encoder.flush().map_err(|e| { + vec![Diagnostic::fatal( + DiagnosticCode::PathNotFound, + format!( + "I couldn't finalize brotli compression: {}", + describe_io_error(&e) + ), + )] + }) + } + } + } +} + +impl Write for CompressionWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + match self { + Self::Plain(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Gzip(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Zlib(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Zstd(w) => w.write(buf), + #[cfg(feature = "compression")] + Self::Brotli(w) => w.write(buf), + } + } + + fn flush(&mut self) -> std::io::Result<()> { + match self { + Self::Plain(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Gzip(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Zlib(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Zstd(w) => w.flush(), + #[cfg(feature = "compression")] + Self::Brotli(w) => w.flush(), + } + } +} + +/// Translate io::Error into human-readable descriptions. +fn describe_io_error(e: &std::io::Error) -> String { + match e.kind() { + std::io::ErrorKind::NotFound => "the directory doesn't exist".to_string(), + std::io::ErrorKind::PermissionDenied => "permission denied".to_string(), + std::io::ErrorKind::AlreadyExists => { + "a directory with that name already exists".to_string() + } + std::io::ErrorKind::StorageFull => "the disk is full".to_string(), + std::io::ErrorKind::ReadOnlyFilesystem => "the filesystem is read-only".to_string(), + _ => e.to_string(), + } +} + +/// Generate helpful advice based on the error type. +fn advice_for_create_error(e: &std::io::Error, path: &Path) -> String { + match e.kind() { + std::io::ErrorKind::NotFound => { + if let Some(parent) = path.parent() { + format!( + "The parent directory '{}' doesn't exist. 
Create it first with:\n mkdir -p '{}'", + parent.display(), + parent.display() + ) + } else { + "Check that the path is valid.".to_string() + } + } + std::io::ErrorKind::PermissionDenied => { + format!( + "You don't have write permission for this location. Try:\n ls -la '{}'", + path.parent() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| ".".to_string()) + ) + } + std::io::ErrorKind::StorageFull => { + "Free up disk space or write to a different location.".to_string() + } + _ => "Check that the path is valid and you have write permission.".to_string(), + } +} + +/// Get a human-readable name for a compression format. +#[cfg(not(feature = "compression"))] +fn format_name(format: CompressionFormat) -> &'static str { + match format { + CompressionFormat::Gzip => "gzip", + CompressionFormat::Zlib => "zlib", + CompressionFormat::Zstd => "zstd", + CompressionFormat::Brotli => "brotli", + CompressionFormat::Deflate => "deflate", + CompressionFormat::None => "uncompressed", + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Read; + use tempfile::NamedTempFile; + + #[test] + fn test_plain_writer() -> Result<(), Box> { + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::None) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello world").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + let content = std::fs::read_to_string(path)?; + assert_eq!(content, "hello world\n"); + Ok(()) + } + + #[test] + #[cfg(feature = "compression")] + fn test_gzip_writer() -> Result<(), Box> { + use flate2::read::GzDecoder; + + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::Gzip) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello gzip").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + // Verify by decompressing + let file = File::open(path)?; + let mut decoder = GzDecoder::new(file); + let mut content = String::new(); + decoder.read_to_string(&mut content)?; + assert_eq!(content, "hello gzip\n"); + Ok(()) + } + + #[test] + #[cfg(feature = "compression")] + fn test_zstd_writer() -> Result<(), Box> { + let temp_file = NamedTempFile::new()?; + let path = temp_file.path(); + + { + let mut writer = CompressionWriter::create(path, CompressionFormat::Zstd) + .map_err(|d| format!("{:?}", d))?; + writeln!(writer, "hello zstd").map_err(|e| format!("{}", e))?; + writer.finish().map_err(|d| format!("{:?}", d))?; + } + + // Verify by decompressing + let file = File::open(path)?; + let mut decoder = zstd::stream::read::Decoder::new(file)?; + let mut content = String::new(); + decoder.read_to_string(&mut content)?; + assert_eq!(content, "hello zstd\n"); + Ok(()) + } + + #[test] + fn test_create_nonexistent_directory() { + let result = CompressionWriter::create( + Path::new("/nonexistent/directory/file.txt"), + CompressionFormat::None, + ); + match result { + Ok(_) => panic!("Expected error for nonexistent directory"), + Err(diagnostics) => { + assert_eq!(diagnostics.len(), 1); + // The error message should mention the path + assert!( + diagnostics[0] + .description + .contains("/nonexistent/directory/file.txt"), + "Expected path in error message, got: {}", + diagnostics[0].description + ); + } + } + } +} diff --git a/src/detection.rs b/src/detection.rs index c2d8de3..8a9f069 100644 --- a/src/detection.rs +++ 
b/src/detection.rs @@ -91,7 +91,10 @@ pub fn is_json_archive>(path: P) -> Result /// Create a buffered reader that handles decompression if needed. #[cfg(feature = "compression")] -fn create_reader(file: File, compression: CompressionFormat) -> Result, std::io::Error> { +fn create_reader( + file: File, + compression: CompressionFormat, +) -> Result, std::io::Error> { Ok(match compression { CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))), CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))), @@ -103,7 +106,10 @@ fn create_reader(file: File, compression: CompressionFormat) -> Result Result, std::io::Error> { +fn create_reader( + file: File, + compression: CompressionFormat, +) -> Result, std::io::Error> { if compression != CompressionFormat::None { // Without compression support, we can't decompress to check the header. // Return false by returning an empty reader that will fail header check. @@ -149,6 +155,19 @@ pub enum CompressionFormat { None, } +impl std::fmt::Display for CompressionFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CompressionFormat::Gzip => write!(f, "gzip"), + CompressionFormat::Deflate => write!(f, "deflate"), + CompressionFormat::Zlib => write!(f, "zlib"), + CompressionFormat::Brotli => write!(f, "brotli"), + CompressionFormat::Zstd => write!(f, "zstd"), + CompressionFormat::None => write!(f, "none"), + } + } +} + pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat { if bytes.len() < 4 { return CompressionFormat::None; @@ -160,12 +179,19 @@ pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat } // Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda - if bytes[0] == 0x78 && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) { + if bytes[0] == 0x78 + && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda) + { return CompressionFormat::Zlib; } // Zstd magic number: 0x28 0xb5 0x2f 0xfd - if bytes.len() >= 4 && bytes[0] == 0x28 && bytes[1] == 0xb5 && bytes[2] == 0x2f && bytes[3] == 0xfd { + if bytes.len() >= 4 + && bytes[0] == 0x28 + && bytes[1] == 0xb5 + && bytes[2] == 0x2f + && bytes[3] == 0xfd + { return CompressionFormat::Zstd; } diff --git a/src/diagnostics.rs b/src/diagnostics.rs index 42d7453..7a22be2 100644 --- a/src/diagnostics.rs +++ b/src/diagnostics.rs @@ -219,6 +219,12 @@ impl Diagnostic { } } +impl From for Vec { + fn from(diagnostic: Diagnostic) -> Self { + vec![diagnostic] + } +} + impl fmt::Display for Diagnostic { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let (Some(filename), Some(line)) = (&self.filename, self.line_number) { diff --git a/src/event_deserialize.rs b/src/event_deserialize.rs index c124e4a..8e33b2e 100644 --- a/src/event_deserialize.rs +++ b/src/event_deserialize.rs @@ -43,7 +43,7 @@ //! //! Spent 30 minutes looking for existing solutions. Checked: //! - serde_path_to_error: Adds field path context but still returns string errors -//! - figment: Configuration library, but sounded like could be used only for diagnostics +//! - figment: Configuration library, but sounded like could be used only for diagnostics //! - config/serde_value: Similar issue //! - json5: Relaxed JSON syntax, not diagnostic-focused //! - miette: a diagnostic library for Rust. It includes a series of @@ -63,10 +63,10 @@ //! diagnostics vec instead of returning errors. The calling code (reader.rs) attaches //! 
location information (filename, line number) after deserialization. +use chrono::{DateTime, Utc}; use serde::de::{Deserialize, Deserializer, SeqAccess, Visitor}; use serde_json::Value; use std::fmt; -use chrono::{DateTime, Utc}; use crate::diagnostics::{Diagnostic, DiagnosticCode, DiagnosticLevel}; use crate::events::Event; @@ -120,7 +120,7 @@ impl<'de> Visitor<'de> for EventVisitor { A: SeqAccess<'de>, { let mut elements: Vec = Vec::new(); - + while let Some(elem) = seq.next_element::()? { elements.push(elem); } @@ -140,7 +140,8 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the first element of an event to be a string event type.".to_string(), + "I expected the first element of an event to be a string event type." + .to_string(), ); return Ok(self.deserializer); } @@ -152,7 +153,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected an observe event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected an observe event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -176,7 +180,8 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the timestamp to be a valid ISO-8601 datetime string.".to_string(), + "I expected the timestamp to be a valid ISO-8601 datetime string." + .to_string(), ); return Ok(self.deserializer); } @@ -215,7 +220,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected an add event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected an add event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -258,7 +266,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a change event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a change event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -301,7 +312,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a remove event to have 3 fields, but found {}.", elements.len()), + format!( + "I expected a remove event to have 3 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -341,7 +355,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a move event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a move event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -394,7 +411,10 @@ impl<'de> Visitor<'de> for EventVisitor { self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldCount, - format!("I expected a snapshot event to have 4 fields, but found {}.", elements.len()), + format!( + "I expected a snapshot event to have 4 fields, but found {}.", + elements.len() + ), ); return Ok(self.deserializer); } @@ -418,7 +438,8 @@ impl<'de> Visitor<'de> for EventVisitor { 
self.deserializer.add_diagnostic( DiagnosticLevel::Fatal, DiagnosticCode::WrongFieldType, - "I expected the timestamp to be a valid ISO-8601 datetime string.".to_string(), + "I expected the timestamp to be a valid ISO-8601 datetime string." + .to_string(), ); return Ok(self.deserializer); } @@ -476,14 +497,18 @@ impl EventVisitor { let from_idx = match pair[0].as_u64() { Some(i) => i as usize, None => { - return Err("I expected the 'from' index to be a non-negative integer.".to_string()); + return Err( + "I expected the 'from' index to be a non-negative integer.".to_string() + ); } }; let to_idx = match pair[1].as_u64() { Some(i) => i as usize, None => { - return Err("I expected the 'to' index to be a non-negative integer.".to_string()); + return Err( + "I expected the 'to' index to be a non-negative integer.".to_string() + ); } }; @@ -503,7 +528,7 @@ mod tests { fn test_deserialize_observe_event() { let json = json!(["observe", "obs-1", "2025-01-01T00:00:00Z", 1]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -518,7 +543,7 @@ mod tests { fn test_deserialize_add_event() { let json = json!(["add", "/count", 42, "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -533,11 +558,14 @@ mod tests { fn test_deserialize_invalid_event_type() { let json = json!(["invalid", "some", "data"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert_eq!(deserializer.diagnostics.len(), 1); - assert_eq!(deserializer.diagnostics[0].code, DiagnosticCode::UnknownEventType); + assert_eq!( + deserializer.diagnostics[0].code, + DiagnosticCode::UnknownEventType + ); assert!(deserializer.event.is_none()); } @@ -545,11 +573,14 @@ mod tests { fn test_deserialize_wrong_field_count() { let json = json!(["observe", "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert_eq!(deserializer.diagnostics.len(), 1); - assert_eq!(deserializer.diagnostics[0].code, DiagnosticCode::WrongFieldCount); + assert_eq!( + deserializer.diagnostics[0].code, + DiagnosticCode::WrongFieldCount + ); assert!(deserializer.event.is_none()); } @@ -557,7 +588,7 @@ mod tests { fn test_deserialize_move_event() { let json = json!(["move", "/items", [[0, 2], [1, 0]], "obs-1"]); let result: Result = serde_json::from_value(json); - + assert!(result.is_ok()); let deserializer = result.unwrap(); assert!(deserializer.diagnostics.is_empty()); @@ -567,4 +598,4 @@ mod tests { if path == "/items" && moves == vec![(0, 2), (1, 0)] && observation_id == "obs-1" )); } -} \ No newline at end of file +} diff --git a/src/events.rs b/src/events.rs index da1ed45..3642c3d 100644 --- a/src/events.rs +++ b/src/events.rs @@ -89,7 +89,6 @@ pub enum Event { }, } - impl Serialize for Event { fn serialize(&self, serializer: S) -> Result where diff --git a/src/flags.rs b/src/flags.rs index 52382e3..d770c09 100644 --- a/src/flags.rs +++ b/src/flags.rs @@ -23,7 +23,7 @@ use std::path::PathBuf; xflags::xflags! { cmd json-archive { - default cmd create { + default cmd write { /// Input JSON files in chronological order. If first file is a .json.archive file, /// appends remaining files to it. Otherwise creates a new archive from all files. 
repeated inputs: PathBuf diff --git a/src/lib.rs b/src/lib.rs index 61234ec..a5b510e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,12 +19,11 @@ // marxism@peoplesgrocers.com // -pub mod archive_context; pub mod archive_open; -pub mod archive_ops; pub mod archive_reader; pub mod archive_writer; pub mod atomic_file; +pub mod compression_writer; pub mod detection; pub mod diagnostics; pub mod diff; @@ -33,12 +32,17 @@ pub mod events; pub mod flags; pub mod pointer; mod pointer_errors; +pub mod write_strategy; -pub use archive_writer::{ - append_to_archive, create_archive_from_files, default_output_filename, ArchiveBuilder, ArchiveWriter, +pub use archive_reader::{ + apply_add, apply_change, apply_move, apply_remove, read_archive, read_events, EventIterator, + ReadMode, ReadResult, }; +pub use archive_writer::{default_output_filename, write_observation, ArchiveWriter}; pub use detection::is_json_archive; pub use diagnostics::{Diagnostic, DiagnosticCode, DiagnosticCollector, DiagnosticLevel}; pub use events::{Event, Header, Observation}; pub use pointer::JsonPointer; -pub use archive_reader::{apply_add, apply_change, apply_move, apply_remove, ArchiveReader, ReadMode, ReadResult}; +pub use write_strategy::{ + compression_from_extension, determine_strategy, CompressedPath, WriteStrategy, +}; diff --git a/src/main.rs b/src/main.rs index 50f6807..ecbcfc9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,9 +19,7 @@ // marxism@peoplesgrocers.com // -use json_archive::archive_ops::{append_to_archive, create_archive, default_output_filename}; -use json_archive::{is_json_archive, Diagnostic, DiagnosticCode, DiagnosticLevel}; -use std::path::Path; +use json_archive::Diagnostic; use std::process; mod cmd; @@ -30,169 +28,22 @@ mod flags; fn main() { let flags = flags::JsonArchive::from_env_or_exit(); - let diagnostics = run(flags); + if let Err(diagnostics) = run(flags) { + for diagnostic in &diagnostics { + eprintln!("{}", diagnostic); + } - for diagnostic in &diagnostics { - eprintln!("{}", diagnostic); - } - - let has_fatal = diagnostics.iter().any(|d| d.is_fatal()); - if has_fatal { - process::exit(1); + let has_fatal = diagnostics.iter().any(|d| d.is_fatal()); + if has_fatal { + process::exit(1); + } } } -fn run(flags: flags::JsonArchive) -> Vec { +fn run(flags: flags::JsonArchive) -> Result<(), Vec> { match flags.subcommand { - flags::JsonArchiveCmd::Create(create_flags) => run_create(&create_flags), + flags::JsonArchiveCmd::Write(write_flags) => cmd::write::run(&write_flags), flags::JsonArchiveCmd::Info(info_flags) => cmd::info::run(&info_flags), flags::JsonArchiveCmd::State(state_flags) => cmd::state::run(&state_flags), } } - -struct ParsedCreateArgs { - destination: std::path::PathBuf, - input_files: Vec, -} - -/// Parse the create command arguments to determine the destination archive and input files. -/// This consolidates all the inferring behavior in one place. -fn parse_create_args(flags: &flags::Create) -> Result> { - if flags.inputs.is_empty() { - return Err(vec![Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::MissingHeaderField, - "I need at least one JSON file to create an archive, but you didn't provide any." - .to_string(), - ) - .with_advice( - "Usage: json-archive [file2.json ...]\n\n\ - The first file will be used as the initial state, and subsequent files \ - will be compared to generate change events." 
- .to_string(), - )]); - } - - // Determine the destination archive path - let destination = if let Some(output) = &flags.output { - // Explicitly specified output path - output.clone() - } else if Path::new(&flags.inputs[0]).exists() - && is_json_archive(&flags.inputs[0]).unwrap_or(false) - { - // First input is an existing archive - use it as destination - flags.inputs[0].clone() - } else { - // Infer from first input - default_output_filename(&flags.inputs[0]) - }; - - // Filter out the destination from input files to avoid read-write conflicts - let input_files: Vec<_> = flags.inputs - .iter() - .filter(|path| { - match (std::fs::canonicalize(path).ok(), std::fs::canonicalize(&destination).ok()) { - (Some(p), Some(d)) => p != d, - _ => true, // Include if canonicalization fails (file doesn't exist yet) - } - }) - .cloned() - .collect(); - - if input_files.is_empty() { - return Err(vec![ - Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::MissingHeaderField, - "No input files remain after filtering out the destination archive.".to_string() - ) - .with_advice( - "You specified the output path in the list of input files. This would cause a read-write conflict.\n\ - Either remove the output path from inputs, or use a different output path with -o." - .to_string() - ) - ]); - } - - // Validate all input files exist - let mut diagnostics = Vec::new(); - for input_path in &input_files { - if !Path::new(input_path).exists() { - diagnostics.push( - Diagnostic::new( - DiagnosticLevel::Fatal, - DiagnosticCode::PathNotFound, - format!("I couldn't find the input file: {}", input_path.display()), - ) - .with_advice( - "Make sure the file path is correct and the file exists. \ - Check for typos in the filename." - .to_string(), - ), - ); - } - } - - if !diagnostics.is_empty() { - return Err(diagnostics); - } - - Ok(ParsedCreateArgs { - destination, - input_files, - }) -} - -fn run_create(flags: &flags::Create) -> Vec { - let parsed = match parse_create_args(flags) { - Ok(parsed) => parsed, - Err(diagnostics) => return diagnostics, - }; - - if let Some(interval) = flags.snapshot_interval { - println!("Snapshot interval: every {} observations", interval); - } - - if let Some(ref source) = flags.source { - println!("Source: {}", source); - } - - // If destination exists and is an archive, append to it - if Path::new(&parsed.destination).exists() { - if let Ok(true) = is_json_archive(&parsed.destination) { - println!("Appending to existing archive: {}", parsed.destination.display()); - println!("Input files: {:?}", parsed.input_files); - - let diagnostics = append_to_archive( - &parsed.destination, - &parsed.input_files, - &parsed.destination, - flags.source.clone(), - flags.snapshot_interval, - ); - - if diagnostics.is_empty() { - println!("Archive updated successfully: {}", parsed.destination.display()); - } - - return diagnostics; - } - } - - // Otherwise create a new archive from the input files - println!("Creating new archive: {}", parsed.destination.display()); - println!("Input files: {:?}", parsed.input_files); - - let diagnostics = create_archive( - &parsed.input_files, - parsed.destination.clone(), - flags.source.clone(), - flags.snapshot_interval, - ); - - if diagnostics.is_empty() { - println!("Archive created successfully: {}", parsed.destination.display()); - } - - diagnostics -} diff --git a/src/pointer_errors.rs b/src/pointer_errors.rs index f447267..de7a1a1 100644 --- a/src/pointer_errors.rs +++ b/src/pointer_errors.rs @@ -390,7 +390,12 @@ mod tests { #[test] fn 
test_type_mismatch_error_output() { - let tokens = vec!["users".to_string(), "0".to_string(), "email".to_string(), "domain".to_string()]; + let tokens = vec![ + "users".to_string(), + "0".to_string(), + "email".to_string(), + "domain".to_string(), + ]; let current = Value::String("alice@example.com".to_string()); let diag = build_type_mismatch_error(&tokens, 3, "domain", ¤t); diff --git a/src/write_strategy.rs b/src/write_strategy.rs new file mode 100644 index 0000000..57b466d --- /dev/null +++ b/src/write_strategy.rs @@ -0,0 +1,352 @@ +// json-archive is a tool for tracking JSON file changes over time +// Copyright (C) 2025 Peoples Grocers LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . +// +// To purchase a license under different terms contact admin@peoplesgrocers.com +// To request changes, report bugs, or give user feedback contact +// marxism@peoplesgrocers.com +// + +//! Write strategy for archive operations. +//! +//! There are exactly two questions: +//! 1. Where do we write? (dest_path) +//! 2. Can we write there directly, or do we need to dance? +//! +//! The dance (temp file + atomic swap) is required when: +//! - source_path == dest_path, AND +//! - the file is compressed +//! +//! Why? Compressed streams don't support append. To add one +//! record to a gzip file, you decompress everything, add the +//! record, recompress everything. If you write to the same +//! file you're reading, you corrupt it mid-operation. +//! +//! So: write to temp, swap when done. See atomic_file.rs. +//! +//! When source != dest, there is no conflict. Read from source, +//! write to dest. Even if source is compressed. Even if dest +//! is compressed. Even if they use different compression. +//! The source is never modified. +//! +//! When source == dest AND uncompressed, just append. Seek to +//! end, write new records. Simple. +//! +//! The output compression format is determined by dest_path's +//! extension, not the source's format. That's a separate concern. +//! +//! ## Truth Table +//! +//! ```text +//! INPUTS OUTPUT FLAG STRATEGY +//! ─────────────────────────────────────────────────────────────── +//! [A.json, B.json] (none) Create { out: A.json.archive, fmt: None } +//! [A.json, B.json] -o X.archive.gz Create { out: X.archive.gz, fmt: Gzip } +//! +//! [A.archive, B.json] (none) Append { path: A.archive } +//! [A.archive, B.json] -o X.archive Direct { in: A.archive/None, out: X.archive/None } +//! +//! [A.archive.gz, B.json] (none) AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx } +//! [A.archive.gz, B.json] -o A.archive.gz AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx } +//! [A.archive.gz, B.json] -o X.archive Direct { in: A.archive.gz/Gzip, out: X.archive/None } +//! [A.archive.gz, B.json] -o X.archive.br Direct { in: A.archive.gz/Gzip, out: X.archive.br/Brotli } +//! ``` +//! +//! The rule: +//! ```text +//! if creating new archive: +//! Create +//! 
else if source != dest: +//! Direct (read from source, write to dest, transcoding as needed) +//! else if source == dest AND uncompressed: +//! Append (seek to end, write) +//! else if source == dest AND compressed: +//! AtomicSwap (read all, write to temp, swap) +//! ``` + +use std::path::{Path, PathBuf}; + +use crate::atomic_file::generate_temp_filename; +use crate::detection::CompressionFormat; + +/// A path with its compression format. +pub type CompressedPath = (PathBuf, CompressionFormat); + +/// Describes how to write archive data based on input/output paths and compression. +#[derive(Debug, Clone)] +pub enum WriteStrategy { + /// Create a new archive from scratch. No existing archive to read. + Create { output: CompressedPath }, + + /// Append to an existing uncompressed archive in-place. + /// Just seek to end and write new records. + Append { path: PathBuf }, + + /// Read from one location, write to another. + /// Handles transcoding between compression formats. + CopyOnWrite { + input: CompressedPath, + output: CompressedPath, + }, + + /// Read compressed archive, write to temp, atomic swap. + /// Required when source == dest AND compressed. + AtomicSwap { + /// The archive path (both input and output) + path: PathBuf, + /// Compression format (same for input and output in this case) + compression: CompressionFormat, + /// Temp file to write to before swapping + temp_path: PathBuf, + }, +} + +/// Determine compression format from file extension. +/// +/// Returns `CompressionFormat::None` for uncompressed files. +pub fn compression_from_extension(path: &Path) -> CompressionFormat { + let s = path.to_string_lossy(); + if s.ends_with(".gz") { + CompressionFormat::Gzip + } else if s.ends_with(".br") { + CompressionFormat::Brotli + } else if s.ends_with(".zst") { + CompressionFormat::Zstd + } else if s.ends_with(".zlib") { + CompressionFormat::Zlib + } else { + CompressionFormat::None + } +} + +/// Determine write strategy from parsed arguments. +/// +/// # Arguments +/// +/// * `source_archive` - Path to existing archive if appending, None if creating new +/// * `dest_path` - Where to write the output +/// * `source_compression` - Compression format of source (from magic bytes). Pass +/// `CompressionFormat::None` if unknown or uncompressed. +/// +/// # Returns +/// +/// The appropriate `WriteStrategy` for this operation. +pub fn determine_strategy( + source_archive: Option<&Path>, + dest_path: &Path, + source_compression: CompressionFormat, +) -> WriteStrategy { + let dest_compression = compression_from_extension(dest_path); + + // No source archive? Creating new. 
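+ // (This covers the first two rows of the truth table above: a brand-new
+ // archive takes its compression purely from dest_compression, i.e. the
+ // destination file extension.)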
+ let Some(source) = source_archive else { + return WriteStrategy::Create { + output: (dest_path.to_path_buf(), dest_compression), + }; + }; + + // Check if source and dest are the same file + let same_file = match (source.canonicalize(), dest_path.canonicalize()) { + (Ok(s), Ok(d)) => s == d, + // dest doesn't exist yet, or other error - not same file + _ => false, + }; + + if !same_file { + // Different files: read from source, write to dest + let source_fmt = if source_compression == CompressionFormat::None { + compression_from_extension(source) + } else { + source_compression + }; + return WriteStrategy::CopyOnWrite { + input: (source.to_path_buf(), source_fmt), + output: (dest_path.to_path_buf(), dest_compression), + }; + } + + // Same file - check if compressed + let compression = if source_compression == CompressionFormat::None { + compression_from_extension(source) + } else { + source_compression + }; + + match compression { + CompressionFormat::None => { + // Uncompressed: can append in-place + WriteStrategy::Append { + path: dest_path.to_path_buf(), + } + } + fmt => { + // Compressed: need atomic swap + WriteStrategy::AtomicSwap { + path: dest_path.to_path_buf(), + compression: fmt, + temp_path: generate_temp_filename(dest_path), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write as IoWrite; + use tempfile::NamedTempFile; + + #[test] + fn test_compression_from_extension() { + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.gz")), + CompressionFormat::Gzip + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.br")), + CompressionFormat::Brotli + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.zst")), + CompressionFormat::Zstd + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive.zlib")), + CompressionFormat::Zlib + ); + assert_eq!( + compression_from_extension(Path::new("foo.json.archive")), + CompressionFormat::None + ); + assert_eq!( + compression_from_extension(Path::new("foo.json")), + CompressionFormat::None + ); + } + + #[test] + fn test_create_new_archive() { + let dest = Path::new("/tmp/new.json.archive"); + let strategy = determine_strategy(None, dest, CompressionFormat::None); + + match strategy { + WriteStrategy::Create { output } => { + assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive")); + assert_eq!(output.1, CompressionFormat::None); + } + _ => panic!("Expected Create strategy"), + } + } + + #[test] + fn test_create_new_compressed_archive() { + let dest = Path::new("/tmp/new.json.archive.gz"); + let strategy = determine_strategy(None, dest, CompressionFormat::None); + + match strategy { + WriteStrategy::Create { output } => { + assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive.gz")); + assert_eq!(output.1, CompressionFormat::Gzip); + } + _ => panic!("Expected Create strategy"), + } + } + + #[test] + fn test_append_uncompressed_same_file() -> Result<(), Box> { + let mut temp = NamedTempFile::with_suffix(".json.archive")?; + writeln!(temp, "test")?; + temp.flush()?; + + let path = temp.path(); + let strategy = determine_strategy(Some(path), path, CompressionFormat::None); + + match strategy { + WriteStrategy::Append { path: p } => { + assert_eq!(p, path); + } + _ => panic!("Expected Append strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_atomic_swap_compressed_same_file() -> Result<(), Box> { + let mut temp = NamedTempFile::with_suffix(".json.archive.gz")?; + writeln!(temp, "test")?; + temp.flush()?; + + let 
path = temp.path(); + let strategy = determine_strategy(Some(path), path, CompressionFormat::Gzip); + + match strategy { + WriteStrategy::AtomicSwap { + path: p, + compression, + temp_path, + } => { + assert_eq!(p, path); + assert_eq!(compression, CompressionFormat::Gzip); + assert!(temp_path.to_string_lossy().contains(".json.archive.gz")); + } + _ => panic!("Expected AtomicSwap strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_direct_different_files() -> Result<(), Box> { + let mut source = NamedTempFile::with_suffix(".json.archive")?; + writeln!(source, "test")?; + source.flush()?; + + let dest = Path::new("/tmp/different.json.archive"); + let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::None); + + match strategy { + WriteStrategy::CopyOnWrite { input, output } => { + assert_eq!(input.0, source.path()); + assert_eq!(input.1, CompressionFormat::None); + assert_eq!(output.0, PathBuf::from("/tmp/different.json.archive")); + assert_eq!(output.1, CompressionFormat::None); + } + _ => panic!("Expected Direct strategy, got {:?}", strategy), + } + + Ok(()) + } + + #[test] + fn test_direct_transcode_compression() -> Result<(), Box> { + let mut source = NamedTempFile::with_suffix(".json.archive.gz")?; + writeln!(source, "test")?; + source.flush()?; + + let dest = Path::new("/tmp/output.json.archive.br"); + let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::Gzip); + + match strategy { + WriteStrategy::CopyOnWrite { input, output } => { + assert_eq!(input.1, CompressionFormat::Gzip); + assert_eq!(output.1, CompressionFormat::Brotli); + } + _ => panic!("Expected Direct strategy, got {:?}", strategy), + } + + Ok(()) + } +} diff --git a/tests/compressed_archive_tests.rs b/tests/compressed_archive_tests.rs index b7ea8d9..babcb7d 100644 --- a/tests/compressed_archive_tests.rs +++ b/tests/compressed_archive_tests.rs @@ -1,64 +1,94 @@ // Integration tests for compressed archive functionality -use json_archive::{append_to_archive, ArchiveWriter, Header}; -use json_archive::{ArchiveReader, ReadMode}; -use serde_json::json; -use std::io::Write; +use json_archive::archive_open::open_archive; +use json_archive::write_observation; +use json_archive::{read_archive, ReadMode}; +use serde_json::{json, Value}; +use std::fs::File; +use std::io::{BufWriter, Write}; use tempfile::NamedTempFile; #[test] #[cfg(feature = "compression")] -fn test_append_to_compressed_archive_basic() -> Result<(), Box> { +fn test_append_to_compressed_archive_basic() { use flate2::write::GzEncoder; use flate2::Compression; - // Create initial archive - let archive_file = NamedTempFile::with_suffix(".json.archive")?; - let header = Header::new(json!({"count": 0}), Some("test".to_string())); + // Create initial archive with one state + let initial_state = create_json_file(&json!({"count": 0})); + let archive_file = NamedTempFile::with_suffix(".json.archive").unwrap(); + #[allow(unused_assignments)] { - let mut writer = ArchiveWriter::new(archive_file.path(), None) - .map_err(|e| format!("Failed to create writer: {:?}", e))?; - writer.write_header(&header) - .map_err(|e| format!("Failed to write header: {:?}", e))?; - writer.finish() - .map_err(|e| format!("Failed to finish: {:?}", e))?; + let file = File::create(archive_file.path()).unwrap(); + let mut writer = BufWriter::new(file); + let mut current_state = Value::Null; + let mut observation_count: usize = 0; + + current_state = write_observation( + &mut writer, + &mut observation_count, + None, + ¤t_state, + 
&initial_state.path().to_path_buf(), + Some("test".to_string()), + ) + .unwrap(); + + writer.flush().unwrap(); } + dump_file(archive_file.path(), "Uncompressed archive"); + // Compress it - let compressed_file = NamedTempFile::with_suffix(".json.archive.gz")?; + let compressed_file = NamedTempFile::with_suffix(".json.archive.gz").unwrap(); { - let input = std::fs::read(archive_file.path())?; + let input = std::fs::read(archive_file.path()).unwrap(); let mut encoder = GzEncoder::new( - compressed_file.as_file().try_clone()?, - Compression::default() + compressed_file.as_file().try_clone().unwrap(), + Compression::default(), ); - encoder.write_all(&input)?; - encoder.finish()?; + encoder.write_all(&input).unwrap(); + encoder.finish().unwrap(); } - // Create a new state file to append - let mut state_file = NamedTempFile::new()?; - writeln!(state_file, r#"{{"count": 1}}"#)?; - state_file.flush()?; + dump_file(compressed_file.path(), "Compressed archive"); - // Append to compressed archive - let diagnostics = append_to_archive( - compressed_file.path(), - &[state_file.path()], - compressed_file.path(), - None, - None, - ); + // Verify the compressed archive can be read + let opened = open_archive(compressed_file.path()).unwrap(); + let result = read_archive( + opened.reader, + &compressed_file.path().display().to_string(), + ReadMode::FullValidation, + ) + .unwrap(); - // Should succeed with no diagnostics - assert!(diagnostics.is_empty(), "Got diagnostics: {:?}", diagnostics); + eprintln!("=== Reader result ==="); + eprintln!("final_state: {:?}", result.final_state); + eprintln!("observation_count: {}", result.observation_count); + eprintln!("diagnostics: {:?}", result.diagnostics); + eprintln!(); - // Verify the archive was updated (decompressed) - let reader = ArchiveReader::new(compressed_file.path(), ReadMode::FullValidation)?; - let result = reader.read(compressed_file.path())?; - assert_eq!(result.final_state, json!({"count": 1})); - assert_eq!(result.observation_count, 1); - - Ok(()) + assert_eq!(result.final_state, json!({"count": 0})); + assert_eq!(result.observation_count, 0); +} + +/// Helper to create a temp file with JSON content +fn create_json_file(content: &Value) -> NamedTempFile { + let mut file = NamedTempFile::new().expect("Failed to create temp file"); + writeln!(file, "{}", serde_json::to_string(content).unwrap()).unwrap(); + file +} + +/// Debug helper: print file contents as both hex and text +fn dump_file(path: &std::path::Path, label: &str) { + let bytes = std::fs::read(path).unwrap(); + eprintln!("=== {} ({} bytes) ===", label, bytes.len()); + eprintln!("Hex: {:02x?}", &bytes[..bytes.len().min(100)]); + if let Ok(text) = std::str::from_utf8(&bytes) { + eprintln!("Text:\n{}", &text[..text.len().min(500)]); + } else { + eprintln!("(not valid UTF-8)"); + } + eprintln!(); } diff --git a/tests/compression-integration/run_brotli_test.sh b/tests/compression-integration/run_brotli_test.sh index c29185a..81d1748 100755 --- a/tests/compression-integration/run_brotli_test.sh +++ b/tests/compression-integration/run_brotli_test.sh @@ -44,11 +44,11 @@ echo "Final archive info:" # Decompress for manual inspection echo "" echo "Decompressing for comparison..." -brotli -d -k "$OUT_DIR/test.json.archive.br" +brotli -d -k "$OUT_DIR/test.json.archive.br" -o "$OUT_DIR/test-decompressed.json.archive" echo "" echo "Decompressed archive info:" -"$BINARY" info "$OUT_DIR/test.json.archive" +"$BINARY" info "$OUT_DIR/test-decompressed.json.archive" echo "" echo "Files in $OUT_DIR:"