80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
Utility script for inspecting gzip-compressed NDJSON files (one JSON document per line).
Processes the file as a stream instead of loading it into memory all at once.

Usage:
    python scripts/inspect_gzip.py <file.json.gz> [--limit N] [--output file.json]
"""
|
||
|
|
import gzip
|
||
|
|
import json
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
|
||
|
|
def inspect_gzip_file(filepath: str, limit: int = None, output_file: str = None) -> int:
    """Pretty-print the records of a gzip-compressed NDJSON file.

    The file is decoded as UTF-8 and read line by line, so only one record is
    held in memory at a time. Blank lines are skipped. Lines that are not
    valid JSON are reported on stderr with their physical line number in the
    decompressed input and are otherwise ignored. A summary with the number
    of printed records is written to stderr at the end.

    Args:
        filepath: Path to the ``.json.gz`` file.
        limit: Maximum number of records to print; ``None`` or ``0`` prints
            all records.
        output_file: If given, records are written to this file (UTF-8,
            overwritten) instead of stdout.

    Returns:
        ``0`` on success, ``1`` if ``filepath`` does not exist.
    """
    if not Path(filepath).exists():
        print(f"Fehler: Datei '{filepath}' nicht gefunden.", file=sys.stderr)
        return 1

    count = 0
    output = open(output_file, 'w', encoding='utf-8') if output_file else sys.stdout

    try:
        with gzip.open(filepath, mode='rt', encoding='utf-8') as f:
            # Number the physical lines (blank and invalid ones included) so
            # that error messages point at the real position in the input
            # rather than at the count of records printed so far.
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue

                # Only the parse step can raise JSONDecodeError; keep the
                # try body limited to it.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"JSON-Fehler in Zeile {line_number}: {e}", file=sys.stderr)
                    continue

                # Pretty-print the single record, one document after another.
                json.dump(record, output, indent=2, ensure_ascii=False)
                output.write('\n')
                count += 1

                # A falsy limit (None or 0) means "no limit".
                if limit and count >= limit:
                    break

        print(f"\n--- {count} Records verarbeitet ---", file=sys.stderr)
    finally:
        # Close only a file we opened ourselves; never close sys.stdout.
        if output_file:
            output.close()

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Parse the command line and run the inspection.

    Returns:
        The exit code produced by ``inspect_gzip_file``.
    """
    cli = argparse.ArgumentParser(
        description='Inspiziert gzip-komprimierte JSON-Dateien (NDJSON-Format)'
    )
    cli.add_argument('file', help='Pfad zur .json.gz Datei')
    cli.add_argument(
        '--limit', '-n',
        type=int,
        default=10,
        help='Maximale Anzahl der Records (default: 10, 0 = alle)',
    )
    cli.add_argument(
        '--output', '-o',
        type=str,
        help='Ausgabe in Datei statt stdout',
    )
    options = cli.parse_args()

    # Zero or a negative value means "no limit", passed on as None.
    if options.limit > 0:
        max_records = options.limit
    else:
        max_records = None

    return inspect_gzip_file(
        options.file,
        limit=max_records,
        output_file=options.output,
    )
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when executed as a script; the value returned by main()
# becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|