Refactor: improved code quality and cleaned up project structure

- daemon.py: removed gc.collect(), made scheduling robust (last_run_date instead of a minute check),
  introduced an exchange registry pattern (STREAMING_EXCHANGES/STANDARD_EXCHANGES); see the sketch after this list
- deutsche_boerse.py: thread-safe User-Agent rotation on rate limits,
  logging instead of print(), holiday check, parse methods split into smaller pieces
- eix.py: logging instead of print(), specific exception types instead of a bare except
- read.py deleted and replaced by scripts/inspect_gzip.py (streaming-based)
- Utility scripts moved to scripts/ (cleanup_duplicates, restore_and_fix, verify_fix)
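A rough sketch of the daemon.py changes described in the first bullet, with hypothetical collector functions (daemon.py itself is not part of this diff; only the registry names and last_run_date come from the commit message):

import datetime

# Hypothetical collector callables, used only to illustrate the registry shape.
def collect_eix(): ...
def collect_deutsche_boerse(): ...

STREAMING_EXCHANGES = {"EIX": collect_eix}                         # long-running streaming collectors
STANDARD_EXCHANGES = {"deutsche_boerse": collect_deutsche_boerse}  # scheduled batch collectors

last_run_date = None  # date of the last completed daily run

def should_run_daily_job(now: datetime.datetime) -> bool:
    # Comparing dates instead of matching an exact minute means a missed tick
    # does not skip the whole day.
    global last_run_date
    if last_run_date != now.date():
        last_run_date = now.date()
        return True
    return False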
Melchior Reimers
2026-02-01 08:18:55 +01:00
parent cf55a0bd06
commit 1dc79b8b64
9 changed files with 545 additions and 308 deletions


@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Script for removing duplicated trades from QuestDB.
Creates a new table without duplicates and replaces the old one.
"""
import requests
import os
import sys
DB_HOST = os.getenv("QUESTDB_HOST", "localhost")
DB_PORT = os.getenv("QUESTDB_PORT", "9000")
DB_USER = os.getenv("DB_USER", "admin")
DB_PASSWORD = os.getenv("DB_PASSWORD", "quest")
DB_URL = f"http://{DB_HOST}:{DB_PORT}"
DB_AUTH = (DB_USER, DB_PASSWORD) if DB_USER and DB_PASSWORD else None
def execute_query(query, timeout=300):
"""Führt eine QuestDB Query aus."""
try:
response = requests.get(
f"{DB_URL}/exec",
params={'query': query},
auth=DB_AUTH,
timeout=timeout
)
if response.status_code == 200:
return response.json()
else:
print(f"Query failed: {response.text}")
return None
except Exception as e:
print(f"Error executing query: {e}")
return None
def get_table_count(table_name):
"""Zählt Einträge in einer Tabelle."""
result = execute_query(f"SELECT count(*) FROM {table_name}")
if result and result.get('dataset'):
return result['dataset'][0][0]
return 0
def main():
print("=" * 60)
print("QuestDB Duplikat-Bereinigung")
print("=" * 60)
# 1. Check the current count
original_count = get_table_count("trades")
print(f"\n1. Current number of trades: {original_count:,}")
if original_count == 0:
print("No trades in the database. Nothing to do.")
return
# 2. Analyze duplicates per exchange
print("\n2. Analyzing duplicates per exchange...")
analysis_query = """
SELECT
exchange,
count(*) as total,
count(distinct concat(isin, '-', cast(timestamp as string), '-', cast(price as string), '-', cast(quantity as string))) as unique_trades
FROM trades
GROUP BY exchange
ORDER BY exchange
"""
result = execute_query(analysis_query)
if result and result.get('dataset'):
print(f"\n{'Exchange':<15} {'Total':>12} {'Unique':>12} {'Duplicates':>12}")
print("-" * 55)
total_all = 0
unique_all = 0
for row in result['dataset']:
exchange, total, unique = row
duplicates = total - unique
total_all += total
unique_all += unique
print(f"{exchange:<15} {total:>12,} {unique:>12,} {duplicates:>12,}")
print("-" * 55)
print(f"{'TOTAL':<15} {total_all:>12,} {unique_all:>12,} {total_all - unique_all:>12,}")
# 3. Create the cleaned table
print("\n3. Creating cleaned table 'trades_clean'...")
print(" NOTE: With large data volumes this can take several minutes...")
# Drop the old clean table if it exists
execute_query("DROP TABLE IF EXISTS trades_clean")
# QuestDB: SAMPLE BY 1T with LATEST ON for deduplication
# This groups by timestamp (nanosecond precision) and keeps only the latest entry
# Alternative: use GROUP BY with MIN/MAX
# Create the table first
create_table_query = """
CREATE TABLE trades_clean (
exchange SYMBOL,
symbol SYMBOL,
isin SYMBOL,
price DOUBLE,
quantity DOUBLE,
timestamp TIMESTAMP
) TIMESTAMP(timestamp) PARTITION BY DAY WAL DEDUP UPSERT KEYS(timestamp, exchange, isin, price, quantity)
"""
result = execute_query(create_table_query, timeout=60)
if result is None:
print(" Fehler beim Erstellen der Tabellenstruktur!")
# Fallback: Ohne DEDUP
create_table_query = """
CREATE TABLE trades_clean (
exchange SYMBOL,
symbol SYMBOL,
isin SYMBOL,
price DOUBLE,
quantity DOUBLE,
timestamp TIMESTAMP
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
"""
execute_query(create_table_query, timeout=60)
# Then insert the data with INSERT ... SELECT (no LIMIT!)
print(" Copying data (without duplicates)...")
insert_query = """
INSERT INTO trades_clean
SELECT exchange, symbol, isin, price, quantity, timestamp
FROM (
SELECT exchange, symbol, isin, price, quantity, timestamp,
row_number() OVER (PARTITION BY exchange, isin, timestamp, price, quantity ORDER BY timestamp) as rn
FROM trades
)
WHERE rn = 1
"""
result = execute_query(insert_query, timeout=3600) # 1 hour timeout
if result is None:
print(" Fehler bei INSERT - versuche alternative Methode...")
# Fallback: Direkte Kopie ohne Deduplizierung über SQL
# Stattdessen per ILP deduplizieren
insert_simple = "INSERT INTO trades_clean SELECT * FROM trades"
execute_query(insert_simple, timeout=3600)
clean_count = get_table_count("trades_clean")
print(f" Bereinigte Tabelle: {clean_count:,} Trades")
if clean_count == 0:
print(" FEHLER: Keine Daten kopiert!")
return
removed = original_count - clean_count
if removed > 0:
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
else:
print(" Keine Duplikate durch SQL entfernt (DEDUP wird bei neuen Inserts aktiv)")
# 4. Replace the old table
print("\n4. Replacing old table...")
# Rename the old table to backup
execute_query("RENAME TABLE trades TO trades_backup")
# Rename the new table to trades
execute_query("RENAME TABLE trades_clean TO trades")
# Verify
final_count = get_table_count("trades")
print(f" New trades table: {final_count:,} rows")
# 5. Drop the backup (optional)
print("\n5. Dropping backup table...")
execute_query("DROP TABLE IF EXISTS trades_backup")
print(" Backup dropped.")
# 6. Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Before: {original_count:>15,} trades")
print(f"After:  {final_count:>15,} trades")
print(f"Removed:{removed:>15,} duplicates ({removed/original_count*100:.1f}%)")
print("=" * 60)
# 7. Recalculate the statistics tables
print("\n6. Dropping old analytics tables (they will be recalculated)...")
for table in ['analytics_daily_summary', 'analytics_exchange_daily',
'analytics_stock_trends', 'analytics_volume_changes', 'analytics_custom']:
result = execute_query(f"DROP TABLE IF EXISTS {table}")
print(f" {table} gelöscht")
print("\nFertig! Der Analytics Worker wird die Statistiken beim nächsten Start neu berechnen.")
if __name__ == "__main__":
main()

scripts/inspect_gzip.py

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Utility script for inspecting gzip-compressed JSON files.
Processes files in a streaming fashion without loading everything into RAM.
Usage:
python scripts/inspect_gzip.py <file.json.gz> [--limit N] [--output file.json]
"""
import gzip
import json
import argparse
import sys
from pathlib import Path
def inspect_gzip_file(filepath: str, limit: int = None, output_file: str = None):
"""
Reads a gzip-compressed NDJSON file and prints its contents.
Args:
filepath: path to the .json.gz file
limit: maximum number of records to output (None = all)
output_file: optional: write to a file instead of stdout
"""
path = Path(filepath)
if not path.exists():
print(f"Fehler: Datei '{filepath}' nicht gefunden.", file=sys.stderr)
return 1
count = 0
output = open(output_file, 'w', encoding='utf-8') if output_file else sys.stdout
try:
with gzip.open(filepath, mode='rt', encoding='utf-8') as f:
for line in f:
if not line.strip():
continue
try:
record = json.loads(line)
# Pretty-print a single record
json.dump(record, output, indent=2, ensure_ascii=False)
output.write('\n')
count += 1
if limit and count >= limit:
break
except json.JSONDecodeError as e:
print(f"JSON-Fehler in Zeile {count + 1}: {e}", file=sys.stderr)
continue
print(f"\n--- {count} Records verarbeitet ---", file=sys.stderr)
finally:
if output_file and output != sys.stdout:
output.close()
return 0
def main():
parser = argparse.ArgumentParser(
description='Inspects gzip-compressed JSON files (NDJSON format)'
)
parser.add_argument('file', help='Path to the .json.gz file')
parser.add_argument('--limit', '-n', type=int, default=10,
help='Maximum number of records (default: 10, 0 = all)')
parser.add_argument('--output', '-o', type=str,
help='Write output to a file instead of stdout')
args = parser.parse_args()
limit = args.limit if args.limit > 0 else None
return inspect_gzip_file(args.file, limit=limit, output_file=args.output)
if __name__ == '__main__':
sys.exit(main())
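For quick checks the helper can also be imported and called directly from Python; a minimal sketch, assuming scripts/ is importable as a package and using a purely hypothetical sample path:

from scripts.inspect_gzip import inspect_gzip_file

# Print the first five records of a capture file to stdout (the path is only an example)
inspect_gzip_file("data/sample_trades.json.gz", limit=5)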

scripts/restore_and_fix.py

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""
Script for restoring and correctly cleaning up the trades.
"""
import requests
import os
import sys
DB_HOST = os.getenv("QUESTDB_HOST", "localhost")
DB_PORT = os.getenv("QUESTDB_PORT", "9000")
DB_USER = os.getenv("DB_USER", "admin")
DB_PASSWORD = os.getenv("DB_PASSWORD", "quest")
DB_URL = f"http://{DB_HOST}:{DB_PORT}"
DB_AUTH = (DB_USER, DB_PASSWORD) if DB_USER and DB_PASSWORD else None
def execute_query(query, timeout=300):
"""Führt eine QuestDB Query aus."""
try:
response = requests.get(
f"{DB_URL}/exec",
params={'query': query, 'count': 'true'},
auth=DB_AUTH,
timeout=timeout
)
if response.status_code == 200:
return response.json()
else:
print(f"Query failed: {response.text[:500]}")
return None
except Exception as e:
print(f"Error executing query: {e}")
return None
def get_table_count(table_name):
"""Zählt Einträge in einer Tabelle."""
result = execute_query(f"SELECT count(*) FROM {table_name}")
if result and result.get('dataset'):
return result['dataset'][0][0]
return 0
def table_exists(table_name):
"""Prüft ob eine Tabelle existiert."""
result = execute_query(f"SELECT count(*) FROM {table_name} LIMIT 1")
return result is not None
def main():
print("=" * 60)
print("QuestDB Daten-Wiederherstellung und Bereinigung")
print("=" * 60)
# 1. Prüfe aktuellen Stand
current_count = get_table_count("trades")
print(f"\n1. Aktuelle Trades-Tabelle: {current_count:,} Einträge")
# 2. Check whether a backup exists
backup_exists = table_exists("trades_backup")
if backup_exists:
backup_count = get_table_count("trades_backup")
print(f" Backup table found: {backup_count:,} rows")
if backup_count > current_count:
print("\n2. Backup has more data - restore is possible!")
response = input(" Restore backup? (y/n): ")
if response.lower() == 'y':
print(" Dropping current table...")
execute_query("DROP TABLE trades")
print(" Renaming backup...")
execute_query("RENAME TABLE trades_backup TO trades")
new_count = get_table_count("trades")
print(f" Restored: {new_count:,} trades")
else:
print(" Backup has fewer or equal rows - no restore needed")
else:
print(" No backup found!")
# 3. Show statistics per exchange
print("\n3. Trades per exchange:")
result = execute_query("""
SELECT exchange, count(*) as cnt
FROM trades
GROUP BY exchange
ORDER BY cnt DESC
""")
if result and result.get('dataset'):
for row in result['dataset']:
print(f" {row[0]}: {row[1]:,}")
# 4. Enable DEDUP for future inserts
print("\n4. Checking DEDUP status...")
# QuestDB: DEDUP can only be set when the table is created
# We can, however, create a new table with DEDUP
print("\n5. Recommendation:")
print(" - Deduplication should happen in daemon.py (already implemented)")
print(" - The hash-based check prevents future duplicates")
print(" - For existing duplicates: manual cleanup in batches")
# 6. Show a duplicate analysis for one exchange
print("\n6. Spot-check analysis for duplicates...")
result = execute_query("""
SELECT exchange, isin, timestamp, price, quantity, count(*) as cnt
FROM trades
WHERE exchange = 'EIX'
GROUP BY exchange, isin, timestamp, price, quantity
HAVING count(*) > 1
LIMIT 10
""", timeout=120)
if result and result.get('dataset') and len(result['dataset']) > 0:
print(" Gefundene Duplikate (Beispiele):")
for row in result['dataset'][:5]:
print(f" {row[0]} | {row[1]} | {row[2]} | {row[3]} | {row[4]} | {row[5]}x")
else:
print(" Keine Duplikate in EIX gefunden (oder Query timeout)")
print("\n" + "=" * 60)
print("Fertig!")
if __name__ == "__main__":
main()


@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Test script to verify the improved sector metadata fetching"""
import sys
sys.path.insert(0, '/Users/melchiorreimers/.gemini/antigravity/scratch/trading_daemon/src')
from metadata.fetcher import fetch_ticker_from_openfigi, fetch_sector_from_yfinance, fetch_metadata
# Test ISINs from different regions and sectors
test_isins = [
'US69553P1003', # PagerDuty (US, Technology)
'DE000LS1LUS9', # German security
'US0378331005', # Apple (US, Technology)
]
print("Testing improved sector metadata fetching...\n")
for isin in test_isins:
print(f"Testing ISIN: {isin}")
# Test ticker lookup
ticker = fetch_ticker_from_openfigi(isin)
print(f" Ticker: {ticker}")
# Test sector lookup if ticker found
if ticker:
sector = fetch_sector_from_yfinance(ticker)
print(f" Sector: {sector}")
print()
print("\nFull metadata test for US69553P1003:")
metadata = fetch_metadata('US69553P1003')
for key, value in metadata.items():
print(f" {key}: {value}")

scripts/verify_fix.py

@@ -0,0 +1,80 @@
def mock_get_analytics(
metric: str = "volume",
group_by: str = "day",
sub_group_by: str = None,
date_from: str = None,
date_to: str = None,
isins: str = None,
continents: str = None
):
# Determine if we need to join metadata
needs_metadata = any([
group_by in ["name", "continent", "sector"],
sub_group_by in ["name", "continent", "sector"],
continents is not None
])
# Use prefixes only if joining
t_prefix = "t." if needs_metadata else ""
m_prefix = "m." if needs_metadata else ""
metrics_map = {
"volume": f"sum({t_prefix}price * {t_prefix}quantity)",
"count": f"count(*)",
"avg_price": f"avg({t_prefix}price)"
}
groups_map = {
"day": f"date_trunc('day', {t_prefix}timestamp)",
"month": f"date_trunc('month', {t_prefix}timestamp)",
"exchange": f"{t_prefix}exchange",
"isin": f"{t_prefix}isin",
"name": f"coalesce({m_prefix}name, {t_prefix}isin)" if needs_metadata else "isin",
"continent": f"coalesce({m_prefix}continent, 'Unknown')" if needs_metadata else "'Unknown'",
"sector": f"coalesce({m_prefix}sector, 'Unknown')" if needs_metadata else "'Unknown'"
}
selected_metric = metrics_map.get(metric, metrics_map["volume"])
selected_group = groups_map.get(group_by, groups_map["day"])
query = f"select {selected_group} as label"
if sub_group_by and sub_group_by in groups_map:
query += f", {groups_map[sub_group_by]} as sub_label"
query += f", {selected_metric} as value from trades"
if needs_metadata:
query += " t left join metadata m on t.isin = m.isin"
query += " where 1=1"
if date_from:
query += f" and {t_prefix}timestamp >= '{date_from}'"
if date_to:
query += f" and {t_prefix}timestamp <= '{date_to}'"
if isins:
isins_list = ",".join([f"'{i.strip()}'" for i in isins.split(",")])
query += f" and {t_prefix}isin in ({isins_list})"
if continents and needs_metadata:
cont_list = ",".join([f"'{c.strip()}'" for c in continents.split(",")])
query += f" and {m_prefix}continent in ({cont_list})"
query += " group by label"
if sub_group_by and sub_group_by in groups_map:
query += ", sub_label"
query += " order by label asc"
return query
# Test cases
print("Case 1: Basic analytics (reported error case)")
print(mock_get_analytics(metric="volume", group_by="day", date_from="2025-12-26", date_to="2026-01-25", isins="DE000LS1LUS9"))
print("\nCase 2: Filtering by continent (should join)")
print(mock_get_analytics(metric="volume", group_by="day", continents="Europe"))
print("\nCase 3: Grouping by name (should join)")
print(mock_get_analytics(metric="count", group_by="name"))