feat: transparently append to compressed archives

When appending to a compressed archive (gzip, brotli, zstd), the tool
now handles compression automatically. Since some compression formats don't
support appending to compressed files in place, we write a new
compressed file with all the data and atomically rename it to replace
the original (this requires enough free space on that filesystem to hold
both copies for the duration of the swap).

This means you can work with compressed archives the same way as
uncompressed ones. Point the tool at your .json.gz file and append
values. No manual decompression/recompression needed.
This commit is contained in:
nobody 2025-11-30 17:09:44 -08:00
commit 2ab1c31993
Signed by: GrocerPublishAgent
GPG key ID: 43B1C298CDDE181C
34 changed files with 4747 additions and 1099 deletions

View file

@ -0,0 +1,64 @@
// Integration tests for compressed archive functionality
use json_archive::{append_to_archive, ArchiveWriter, Header};
use json_archive::{ArchiveReader, ReadMode};
use serde_json::json;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
#[cfg(feature = "compression")]
fn test_append_to_compressed_archive_basic() -> Result<(), Box<dyn std::error::Error>> {
    use flate2::write::GzEncoder;
    use flate2::Compression;

    // Build a small uncompressed archive: header only, no observations yet.
    let plain_archive = NamedTempFile::with_suffix(".json.archive")?;
    let header = Header::new(json!({"count": 0}), Some("test".to_string()));
    {
        let mut writer = ArchiveWriter::new(plain_archive.path(), None)
            .map_err(|e| format!("Failed to create writer: {:?}", e))?;
        writer
            .write_header(&header)
            .map_err(|e| format!("Failed to write header: {:?}", e))?;
        writer
            .finish()
            .map_err(|e| format!("Failed to finish: {:?}", e))?;
    }

    // Gzip the archive so the append path has to deal with compression.
    let gz_archive = NamedTempFile::with_suffix(".json.archive.gz")?;
    {
        let raw = std::fs::read(plain_archive.path())?;
        let mut encoder =
            GzEncoder::new(gz_archive.as_file().try_clone()?, Compression::default());
        encoder.write_all(&raw)?;
        encoder.finish()?;
    }

    // One new state observation to append.
    let mut observation = NamedTempFile::new()?;
    writeln!(observation, r#"{{"count": 1}}"#)?;
    observation.flush()?;

    // Append in place: input and output are the same compressed path.
    let diagnostics = append_to_archive(
        gz_archive.path(),
        &[observation.path()],
        gz_archive.path(),
        None,
        None,
    );
    assert!(diagnostics.is_empty(), "Got diagnostics: {:?}", diagnostics);

    // Reading the archive back must reflect the appended observation.
    let reader = ArchiveReader::new(gz_archive.path(), ReadMode::FullValidation)?;
    let result = reader.read(gz_archive.path())?;
    assert_eq!(result.final_state, json!({"count": 1}));
    assert_eq!(result.observation_count, 1);
    Ok(())
}

View file

@ -0,0 +1,78 @@
# Compression Integration Tests
Manual integration tests for compressed archive functionality.
These scripts exercise the tool's ability to:
1. Read archives that were compressed by external programs (gzip, brotli, zstd)
2. Append new observations to compressed archives
3. Produce correct results whether reading compressed or uncompressed
## Scripts
### `generate_state.py <n>`
Generates a JSON state file with `n` items in each array. Output goes to stdout.
```bash
./generate_state.py 3
# Output: {"colors": ["color_1", "color_2", "color_3"], "numbers": ["number_1", "number_2", "number_3"], "animals": ["animal_1", "animal_2", "animal_3"]}
```
### `generate_state_files.py <count> <output_dir>`
Generates a series of state files (state_1.json through state_N.json) with progressively more items.
```bash
./generate_state_files.py 9 ./data
# Creates: data/state_1.json, data/state_2.json, ... data/state_9.json
```
### `run_gzip_test.sh`
Tests the gzip compression workflow:
1. Create archive from first state file
2. Compress with gzip
3. Append remaining 8 state files to the compressed archive
4. Decompress and inspect
### `run_brotli_test.sh`
Same workflow but with brotli compression.
### `run_zstd_test.sh`
Same workflow but with zstd compression.
### `run_all.sh`
Runs all compression tests in sequence.
### `validate.sh` (optional)
Smoke test to verify the final state matches expectations.
## Usage
```bash
cd tests/compression-integration
# Run all tests (generates data, builds, runs all compression formats)
./run_all.sh
# Or run individual steps:
./generate_state_files.py 9 ./data
./run_gzip_test.sh
./run_brotli_test.sh
./run_zstd_test.sh
# Optional: validate outputs match
./validate.sh
```
## What to look for
After running the tests, you can manually verify:
1. The compressed archives were created
2. Appending to compressed archives worked (check file sizes grew)
3. The `info` command shows the same observation count for compressed and decompressed versions
4. The `state` command returns the same final state
## Dependencies
- gzip (usually pre-installed)
- brotli (`brew install brotli`)
- zstd (`brew install zstd`)

View file

@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""
Generate a JSON state file with N items in each array.
Output goes to stdout.
Usage: ./generate_state.py <n>
"""
import json
import sys
def main():
    """CLI entry point: print a JSON state with N items per array to stdout.

    Exits with status 1 and a usage message on stderr when the argument is
    missing, not an integer, or negative (the original crashed with a raw
    ValueError traceback on bad input).
    """
    if len(sys.argv) != 2:
        print("Usage: generate_state.py <n>", file=sys.stderr)
        sys.exit(1)
    try:
        n = int(sys.argv[1])
    except ValueError:
        print("Usage: generate_state.py <n>  (n must be an integer)", file=sys.stderr)
        sys.exit(1)
    if n < 0:
        print("Usage: generate_state.py <n>  (n must be >= 0)", file=sys.stderr)
        sys.exit(1)
    state = {
        "colors": [f"color_{i}" for i in range(1, n + 1)],
        "numbers": [f"number_{i}" for i in range(1, n + 1)],
        "animals": [f"animal_{i}" for i in range(1, n + 1)],
    }
    print(json.dumps(state))
# Run only when executed as a script (not on import).
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""
Generate a series of state files with progressively more items.
Usage: ./generate_state_files.py <count> <output_dir>
Creates: output_dir/state_1.json, state_2.json, ..., state_N.json
"""
import json
import os
import sys
def generate_state(n):
    """Build a state dict whose three arrays each hold n sequential items."""
    categories = {"colors": "color", "numbers": "number", "animals": "animal"}
    return {
        name: [f"{singular}_{i}" for i in range(1, n + 1)]
        for name, singular in categories.items()
    }
def main():
    """CLI entry point: write state_1.json .. state_N.json into output_dir.

    Each file i contains generate_state(i), so the series grows
    progressively. Exits with status 1 and a usage message on stderr when
    the arguments are missing or the count is not a non-negative integer
    (the original crashed with a raw ValueError traceback on bad input).
    """
    if len(sys.argv) != 3:
        print("Usage: generate_state_files.py <count> <output_dir>", file=sys.stderr)
        sys.exit(1)
    try:
        count = int(sys.argv[1])
    except ValueError:
        print("Usage: generate_state_files.py <count> <output_dir>  (count must be an integer)", file=sys.stderr)
        sys.exit(1)
    if count < 0:
        print("Usage: generate_state_files.py <count> <output_dir>  (count must be >= 0)", file=sys.stderr)
        sys.exit(1)
    output_dir = sys.argv[2]
    os.makedirs(output_dir, exist_ok=True)
    for i in range(1, count + 1):
        state = generate_state(i)
        path = os.path.join(output_dir, f"state_{i}.json")
        with open(path, "w") as f:
            json.dump(state, f)
        print(f"Created {path}")
# Run only when executed as a script (not on import).
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,33 @@
#!/usr/bin/env bash
#
# Run all compression integration tests.
#
# Usage: ./run_all.sh
#
set -euo pipefail

# This script's directory and the project root (two levels up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"

echo "=== Building json-archive with compression support ==="
cd "$PROJECT_DIR"
cargo build --features compression

echo ""
echo "=== Generating test data ==="
cd "$SCRIPT_DIR"
python3 generate_state_files.py 9 ./data

# Run each per-format test, separated by a blank line.
for test_script in run_gzip_test.sh run_brotli_test.sh run_zstd_test.sh; do
    echo ""
    "$SCRIPT_DIR/$test_script"
done

echo ""
echo "=== All tests complete ==="
echo "Output files are in: $SCRIPT_DIR/out/"

View file

@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# Test brotli compression workflow:
# 1. Create archive from first state file
# 2. Compress with brotli
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail

# This script's directory and the project root (two levels up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/brotli"

echo "=== Brotli Compression Test ==="

# Setup: start from a clean output directory.
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"

# Create initial archive from first state file
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$OUT_DIR/test.json.archive"

# Compress with brotli. Unlike gzip, brotli keeps its input file by
# default, so remove the uncompressed original (as run_zstd_test.sh does).
# Otherwise a stale copy lingers and the `brotli -d` step below fails
# under `set -e` because brotli refuses to overwrite an existing file.
echo "Compressing with brotli..."
brotli "$OUT_DIR/test.json.archive"
rm "$OUT_DIR/test.json.archive"
ls -la "$OUT_DIR/"

# Append remaining files to compressed archive
for i in $(seq 2 9); do
    echo "Appending state_$i.json to compressed archive..."
    "$BINARY" "$OUT_DIR/test.json.archive.br" "$DATA_DIR/state_$i.json"
done

# Show info on the result
echo ""
echo "Final archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive.br"

# Decompress for manual inspection (-k keeps the .br file around)
echo ""
echo "Decompressing for comparison..."
brotli -d -k "$OUT_DIR/test.json.archive.br"

echo ""
echo "Decompressed archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive"

echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# Test gzip compression workflow:
# 1. Create archive from first state file
# 2. Compress with gzip
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail

# This script's directory and the project root (two levels up), so the
# script works regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
# Debug binary produced by `cargo build --features compression` (run_all.sh).
BINARY="$PROJECT_DIR/target/debug/json-archive"
# state_N.json fixtures generated by generate_state_files.py.
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/gzip"

echo "=== Gzip Compression Test ==="

# Setup: start from a clean output directory.
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"

# Create initial archive from first state file
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$OUT_DIR/test.json.archive"

# Compress with gzip (gzip removes the input by default, leaving only
# test.json.archive.gz in OUT_DIR).
echo "Compressing with gzip..."
gzip "$OUT_DIR/test.json.archive"
ls -la "$OUT_DIR/"

# Append remaining files to compressed archive; the tool is pointed
# directly at the .gz file each time.
for i in $(seq 2 9); do
    echo "Appending state_$i.json to compressed archive..."
    "$BINARY" "$OUT_DIR/test.json.archive.gz" "$DATA_DIR/state_$i.json"
done

# Show info on the result
echo ""
echo "Final archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive.gz"

# Decompress for manual inspection. Prefer `gunzip -k` (keeps the .gz);
# if this gunzip lacks -k, fall back to streaming with `gunzip -c` into
# the target path. Errors from the first attempt are discarded so the
# fallback decides.
echo ""
echo "Decompressing for comparison..."
gunzip -k "$OUT_DIR/test.json.archive.gz" 2>/dev/null || gunzip -c "$OUT_DIR/test.json.archive.gz" > "$OUT_DIR/test.json.archive"

echo ""
echo "Decompressed archive info:"
"$BINARY" info "$OUT_DIR/test.json.archive"

echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@ -0,0 +1,56 @@
#!/usr/bin/env bash
#
# Test zstd compression workflow:
# 1. Create archive from first state file
# 2. Compress with zstd
# 3. Append remaining state files to the compressed archive
# 4. Decompress and show info
#
set -euo pipefail

# This script's directory and the project root (two levels up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"
DATA_DIR="$SCRIPT_DIR/data"
OUT_DIR="$SCRIPT_DIR/out/zstd"
ARCHIVE="$OUT_DIR/test.json.archive"

echo "=== Zstd Compression Test ==="

# Start from a clean output directory.
rm -rf "$OUT_DIR"
mkdir -p "$OUT_DIR"

# Build the initial archive from the first state file.
echo "Creating archive from state_1.json..."
"$BINARY" "$DATA_DIR/state_1.json" -o "$ARCHIVE"

# zstd keeps its input by default; drop the uncompressed copy so only
# the .zst file remains.
echo "Compressing with zstd..."
zstd "$ARCHIVE"
rm "$ARCHIVE"
ls -la "$OUT_DIR/"

# Append each remaining state file directly to the compressed archive.
for i in {2..9}; do
    echo "Appending state_$i.json to compressed archive..."
    "$BINARY" "$ARCHIVE.zst" "$DATA_DIR/state_$i.json"
done

echo ""
echo "Final archive info:"
"$BINARY" info "$ARCHIVE.zst"

# Decompress (-k keeps the .zst) so the result can be inspected by hand.
echo ""
echo "Decompressing for comparison..."
zstd -d -k "$ARCHIVE.zst"

echo ""
echo "Decompressed archive info:"
"$BINARY" info "$ARCHIVE"

echo ""
echo "Files in $OUT_DIR:"
ls -la "$OUT_DIR/"

View file

@ -0,0 +1,63 @@
#!/usr/bin/env bash
#
# Validate that compressed and decompressed archives produce the same results.
# Run this after run_all.sh to smoke test the outputs.
#
set -euo pipefail

# This script's directory and the project root (two levels up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
BINARY="$PROJECT_DIR/target/debug/json-archive"

# Extract observation_count from `info --output json` for a given archive.
obs_count() {
    "$BINARY" info "$1" --output json | python3 -c "import sys,json; print(json.load(sys.stdin)['observation_count'])"
}

echo "=== Validation ==="
errors=0

for format in gzip brotli zstd; do
    dir="$SCRIPT_DIR/out/$format"
    if [[ ! -d "$dir" ]]; then
        echo "SKIP: $format (no output directory)"
        continue
    fi

    # Locate one compressed archive and its decompressed counterpart.
    compressed=$(find "$dir" \( -name "*.gz" -o -name "*.br" -o -name "*.zst" \) -print | head -1)
    uncompressed="$dir/test.json.archive"
    if [[ ! -f "$compressed" || ! -f "$uncompressed" ]]; then
        echo "SKIP: $format (missing files)"
        continue
    fi

    # The reconstructed final state must be identical either way.
    state_compressed=$("$BINARY" state "$compressed")
    state_uncompressed=$("$BINARY" state "$uncompressed")
    if [[ "$state_compressed" == "$state_uncompressed" ]]; then
        echo "OK: $format - state matches"
    else
        echo "FAIL: $format - state differs"
        errors=$((errors + 1))
    fi

    # So must the observation count reported by `info`.
    count_compressed=$(obs_count "$compressed")
    count_uncompressed=$(obs_count "$uncompressed")
    if [[ "$count_compressed" == "$count_uncompressed" ]]; then
        echo "OK: $format - observation count matches ($count_compressed)"
    else
        echo "FAIL: $format - observation count differs ($count_compressed vs $count_uncompressed)"
        errors=$((errors + 1))
    fi
done

echo ""
if [[ $errors -eq 0 ]]; then
    echo "All validations passed."
else
    echo "$errors validation(s) failed."
    exit 1
fi