Refactor: improved code quality and cleaned up project structure

- daemon.py: removed gc.collect(), made scheduling robust (last_run_date instead of a minute check),
  introduced an exchange registry pattern (STREAMING_EXCHANGES/STANDARD_EXCHANGES); see the sketch after this list
- deutsche_boerse.py: thread-safe User-Agent rotation on rate limits,
  logging instead of print(), holiday check, parse methods split into smaller pieces
- eix.py: logging instead of print(), specific exception types instead of a bare except
- read.py deleted and replaced by scripts/inspect_gzip.py (streaming-based)
- Utility scripts moved to scripts/ (cleanup_duplicates, restore_and_fix, verify_fix)
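A rough sketch of the daemon.py changes described in the first bullet, with hypothetical collector functions (daemon.py itself is not part of this diff; only the registry names and last_run_date come from the commit message):

import datetime

# Hypothetical collector callables, used only to illustrate the registry shape.
def collect_eix(): ...
def collect_deutsche_boerse(): ...

STREAMING_EXCHANGES = {"EIX": collect_eix}                         # long-running streaming collectors
STANDARD_EXCHANGES = {"deutsche_boerse": collect_deutsche_boerse}  # scheduled batch collectors

last_run_date = None  # date of the last completed daily run

def should_run_daily_job(now: datetime.datetime) -> bool:
    # Comparing dates instead of matching an exact minute means a missed tick
    # does not skip the whole day.
    global last_run_date
    if last_run_date != now.date():
        last_run_date = now.date()
        return True
    return False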
Melchior Reimers
2026-02-01 08:18:55 +01:00
parent cf55a0bd06
commit 1dc79b8b64
9 changed files with 545 additions and 308 deletions


@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Script for removing duplicated trades from QuestDB.
Creates a new table without duplicates and replaces the old one.
"""
import requests
import os
import sys
DB_HOST = os.getenv("QUESTDB_HOST", "localhost")
DB_PORT = os.getenv("QUESTDB_PORT", "9000")
DB_USER = os.getenv("DB_USER", "admin")
DB_PASSWORD = os.getenv("DB_PASSWORD", "quest")
DB_URL = f"http://{DB_HOST}:{DB_PORT}"
DB_AUTH = (DB_USER, DB_PASSWORD) if DB_USER and DB_PASSWORD else None
def execute_query(query, timeout=300):
"""Führt eine QuestDB Query aus."""
try:
response = requests.get(
f"{DB_URL}/exec",
params={'query': query},
auth=DB_AUTH,
timeout=timeout
)
if response.status_code == 200:
return response.json()
else:
print(f"Query failed: {response.text}")
return None
except Exception as e:
print(f"Error executing query: {e}")
return None
def get_table_count(table_name):
"""Zählt Einträge in einer Tabelle."""
result = execute_query(f"SELECT count(*) FROM {table_name}")
if result and result.get('dataset'):
return result['dataset'][0][0]
return 0
def main():
print("=" * 60)
print("QuestDB Duplikat-Bereinigung")
print("=" * 60)
# 1. Check the current count
original_count = get_table_count("trades")
print(f"\n1. Current number of trades: {original_count:,}")
if original_count == 0:
print("No trades in the database. Nothing to do.")
return
# 2. Analyze duplicates per exchange
print("\n2. Analyzing duplicates per exchange...")
analysis_query = """
SELECT
exchange,
count(*) as total,
count(distinct concat(isin, '-', cast(timestamp as string), '-', cast(price as string), '-', cast(quantity as string))) as unique_trades
FROM trades
GROUP BY exchange
ORDER BY exchange
"""
result = execute_query(analysis_query)
if result and result.get('dataset'):
print(f"\n{'Exchange':<15} {'Total':>12} {'Unique':>12} {'Duplicates':>12}")
print("-" * 55)
total_all = 0
unique_all = 0
for row in result['dataset']:
exchange, total, unique = row
duplicates = total - unique
total_all += total
unique_all += unique
print(f"{exchange:<15} {total:>12,} {unique:>12,} {duplicates:>12,}")
print("-" * 55)
print(f"{'TOTAL':<15} {total_all:>12,} {unique_all:>12,} {total_all - unique_all:>12,}")
# 3. Create the cleaned table
print("\n3. Creating cleaned table 'trades_clean'...")
print(" NOTE: With large data volumes this can take several minutes...")
# Drop the old clean table if it exists
execute_query("DROP TABLE IF EXISTS trades_clean")
# QuestDB: SAMPLE BY 1T with LATEST ON for deduplication
# This groups by timestamp (nanosecond precision) and keeps only the latest entry
# Alternative: use GROUP BY with MIN/MAX
# Create the table first
create_table_query = """
CREATE TABLE trades_clean (
exchange SYMBOL,
symbol SYMBOL,
isin SYMBOL,
price DOUBLE,
quantity DOUBLE,
timestamp TIMESTAMP
) TIMESTAMP(timestamp) PARTITION BY DAY WAL DEDUP UPSERT KEYS(timestamp, exchange, isin, price, quantity)
"""
result = execute_query(create_table_query, timeout=60)
if result is None:
print(" Fehler beim Erstellen der Tabellenstruktur!")
# Fallback: Ohne DEDUP
create_table_query = """
CREATE TABLE trades_clean (
exchange SYMBOL,
symbol SYMBOL,
isin SYMBOL,
price DOUBLE,
quantity DOUBLE,
timestamp TIMESTAMP
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
"""
execute_query(create_table_query, timeout=60)
# Then insert the data with INSERT ... SELECT (no LIMIT!)
print(" Copying data (without duplicates)...")
insert_query = """
INSERT INTO trades_clean
SELECT exchange, symbol, isin, price, quantity, timestamp
FROM (
SELECT exchange, symbol, isin, price, quantity, timestamp,
row_number() OVER (PARTITION BY exchange, isin, timestamp, price, quantity ORDER BY timestamp) as rn
FROM trades
)
WHERE rn = 1
"""
result = execute_query(insert_query, timeout=3600) # 1 hour timeout
if result is None:
print(" Fehler bei INSERT - versuche alternative Methode...")
# Fallback: Direkte Kopie ohne Deduplizierung über SQL
# Stattdessen per ILP deduplizieren
insert_simple = "INSERT INTO trades_clean SELECT * FROM trades"
execute_query(insert_simple, timeout=3600)
clean_count = get_table_count("trades_clean")
print(f" Bereinigte Tabelle: {clean_count:,} Trades")
if clean_count == 0:
print(" FEHLER: Keine Daten kopiert!")
return
removed = original_count - clean_count
if removed > 0:
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
else:
print(" Keine Duplikate durch SQL entfernt (DEDUP wird bei neuen Inserts aktiv)")
# 4. Replace the old table
print("\n4. Replacing old table...")
# Rename the old table to backup
execute_query("RENAME TABLE trades TO trades_backup")
# Rename the new table to trades
execute_query("RENAME TABLE trades_clean TO trades")
# Verify
final_count = get_table_count("trades")
print(f" New trades table: {final_count:,} rows")
# 5. Drop the backup (optional)
print("\n5. Dropping backup table...")
execute_query("DROP TABLE IF EXISTS trades_backup")
print(" Backup dropped.")
# 6. Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Before: {original_count:>15,} trades")
print(f"After:  {final_count:>15,} trades")
print(f"Removed:{removed:>15,} duplicates ({removed/original_count*100:.1f}%)")
print("=" * 60)
# 7. Recalculate the statistics tables
print("\n6. Dropping old analytics tables (they will be recalculated)...")
for table in ['analytics_daily_summary', 'analytics_exchange_daily',
'analytics_stock_trends', 'analytics_volume_changes', 'analytics_custom']:
result = execute_query(f"DROP TABLE IF EXISTS {table}")
print(f" {table} gelöscht")
print("\nFertig! Der Analytics Worker wird die Statistiken beim nächsten Start neu berechnen.")
if __name__ == "__main__":
main()

scripts/inspect_gzip.py

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Utility script for inspecting gzip-compressed JSON files.
Processes files in a streaming fashion without loading everything into RAM.
Usage:
python scripts/inspect_gzip.py <file.json.gz> [--limit N] [--output file.json]
"""
import gzip
import json
import argparse
import sys
from pathlib import Path
def inspect_gzip_file(filepath: str, limit: int = None, output_file: str = None):
"""
Reads a gzip-compressed NDJSON file and prints its contents.
Args:
filepath: path to the .json.gz file
limit: maximum number of records to output (None = all)
output_file: optional: write to a file instead of stdout
"""
path = Path(filepath)
if not path.exists():
print(f"Fehler: Datei '{filepath}' nicht gefunden.", file=sys.stderr)
return 1
count = 0
output = open(output_file, 'w', encoding='utf-8') if output_file else sys.stdout
try:
with gzip.open(filepath, mode='rt', encoding='utf-8') as f:
for line in f:
if not line.strip():
continue
try:
record = json.loads(line)
# Pretty-print a single record
json.dump(record, output, indent=2, ensure_ascii=False)
output.write('\n')
count += 1
if limit and count >= limit:
break
except json.JSONDecodeError as e:
print(f"JSON-Fehler in Zeile {count + 1}: {e}", file=sys.stderr)
continue
print(f"\n--- {count} Records verarbeitet ---", file=sys.stderr)
finally:
if output_file and output != sys.stdout:
output.close()
return 0
def main():
parser = argparse.ArgumentParser(
description='Inspects gzip-compressed JSON files (NDJSON format)'
)
parser.add_argument('file', help='Path to the .json.gz file')
parser.add_argument('--limit', '-n', type=int, default=10,
help='Maximum number of records (default: 10, 0 = all)')
parser.add_argument('--output', '-o', type=str,
help='Write output to a file instead of stdout')
args = parser.parse_args()
limit = args.limit if args.limit > 0 else None
return inspect_gzip_file(args.file, limit=limit, output_file=args.output)
if __name__ == '__main__':
sys.exit(main())
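For quick checks the helper can also be imported and called directly from Python; a minimal sketch, assuming scripts/ is importable as a package and using a purely hypothetical sample path:

from scripts.inspect_gzip import inspect_gzip_file

# Print the first five records of a capture file to stdout (the path is only an example)
inspect_gzip_file("data/sample_trades.json.gz", limit=5)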

scripts/restore_and_fix.py

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""
Script for restoring and correctly cleaning up the trades.
"""
import requests
import os
import sys
DB_HOST = os.getenv("QUESTDB_HOST", "localhost")
DB_PORT = os.getenv("QUESTDB_PORT", "9000")
DB_USER = os.getenv("DB_USER", "admin")
DB_PASSWORD = os.getenv("DB_PASSWORD", "quest")
DB_URL = f"http://{DB_HOST}:{DB_PORT}"
DB_AUTH = (DB_USER, DB_PASSWORD) if DB_USER and DB_PASSWORD else None
def execute_query(query, timeout=300):
"""Führt eine QuestDB Query aus."""
try:
response = requests.get(
f"{DB_URL}/exec",
params={'query': query, 'count': 'true'},
auth=DB_AUTH,
timeout=timeout
)
if response.status_code == 200:
return response.json()
else:
print(f"Query failed: {response.text[:500]}")
return None
except Exception as e:
print(f"Error executing query: {e}")
return None
def get_table_count(table_name):
"""Zählt Einträge in einer Tabelle."""
result = execute_query(f"SELECT count(*) FROM {table_name}")
if result and result.get('dataset'):
return result['dataset'][0][0]
return 0
def table_exists(table_name):
"""Prüft ob eine Tabelle existiert."""
result = execute_query(f"SELECT count(*) FROM {table_name} LIMIT 1")
return result is not None
def main():
print("=" * 60)
print("QuestDB Daten-Wiederherstellung und Bereinigung")
print("=" * 60)
# 1. Prüfe aktuellen Stand
current_count = get_table_count("trades")
print(f"\n1. Aktuelle Trades-Tabelle: {current_count:,} Einträge")
# 2. Check whether a backup exists
backup_exists = table_exists("trades_backup")
if backup_exists:
backup_count = get_table_count("trades_backup")
print(f" Backup table found: {backup_count:,} rows")
if backup_count > current_count:
print("\n2. Backup has more data - restore is possible!")
response = input(" Restore backup? (y/n): ")
if response.lower() == 'y':
print(" Dropping current table...")
execute_query("DROP TABLE trades")
print(" Renaming backup...")
execute_query("RENAME TABLE trades_backup TO trades")
new_count = get_table_count("trades")
print(f" Restored: {new_count:,} trades")
else:
print(" Backup has fewer or equal rows - no restore needed")
else:
print(" No backup found!")
# 3. Show statistics per exchange
print("\n3. Trades per exchange:")
result = execute_query("""
SELECT exchange, count(*) as cnt
FROM trades
GROUP BY exchange
ORDER BY cnt DESC
""")
if result and result.get('dataset'):
for row in result['dataset']:
print(f" {row[0]}: {row[1]:,}")
# 4. Enable DEDUP for future inserts
print("\n4. Checking DEDUP status...")
# QuestDB: DEDUP can only be set when the table is created
# We can, however, create a new table with DEDUP
print("\n5. Recommendation:")
print(" - Deduplication should happen in daemon.py (already implemented)")
print(" - The hash-based check prevents future duplicates")
print(" - For existing duplicates: manual cleanup in batches")
# 6. Show a duplicate analysis for one exchange
print("\n6. Spot-check analysis for duplicates...")
result = execute_query("""
SELECT exchange, isin, timestamp, price, quantity, count(*) as cnt
FROM trades
WHERE exchange = 'EIX'
GROUP BY exchange, isin, timestamp, price, quantity
HAVING count(*) > 1
LIMIT 10
""", timeout=120)
if result and result.get('dataset') and len(result['dataset']) > 0:
print(" Gefundene Duplikate (Beispiele):")
for row in result['dataset'][:5]:
print(f" {row[0]} | {row[1]} | {row[2]} | {row[3]} | {row[4]} | {row[5]}x")
else:
print(" Keine Duplikate in EIX gefunden (oder Query timeout)")
print("\n" + "=" * 60)
print("Fertig!")
if __name__ == "__main__":
main()


@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Test script to verify the improved sector metadata fetching"""
import sys
sys.path.insert(0, '/Users/melchiorreimers/.gemini/antigravity/scratch/trading_daemon/src')
from metadata.fetcher import fetch_ticker_from_openfigi, fetch_sector_from_yfinance, fetch_metadata
# Test ISINs from different regions and sectors
test_isins = [
'US69553P1003', # PagerDuty (US, Technology)
'DE000LS1LUS9', # German security
'US0378331005', # Apple (US, Technology)
]
print("Testing improved sector metadata fetching...\n")
for isin in test_isins:
print(f"Testing ISIN: {isin}")
# Test ticker lookup
ticker = fetch_ticker_from_openfigi(isin)
print(f" Ticker: {ticker}")
# Test sector lookup if ticker found
if ticker:
sector = fetch_sector_from_yfinance(ticker)
print(f" Sector: {sector}")
print()
print("\nFull metadata test for US69553P1003:")
metadata = fetch_metadata('US69553P1003')
for key, value in metadata.items():
print(f" {key}: {value}")

scripts/verify_fix.py

@@ -0,0 +1,80 @@
def mock_get_analytics(
metric: str = "volume",
group_by: str = "day",
sub_group_by: str = None,
date_from: str = None,
date_to: str = None,
isins: str = None,
continents: str = None
):
# Determine if we need to join metadata
needs_metadata = any([
group_by in ["name", "continent", "sector"],
sub_group_by in ["name", "continent", "sector"],
continents is not None
])
# Use prefixes only if joining
t_prefix = "t." if needs_metadata else ""
m_prefix = "m." if needs_metadata else ""
metrics_map = {
"volume": f"sum({t_prefix}price * {t_prefix}quantity)",
"count": f"count(*)",
"avg_price": f"avg({t_prefix}price)"
}
groups_map = {
"day": f"date_trunc('day', {t_prefix}timestamp)",
"month": f"date_trunc('month', {t_prefix}timestamp)",
"exchange": f"{t_prefix}exchange",
"isin": f"{t_prefix}isin",
"name": f"coalesce({m_prefix}name, {t_prefix}isin)" if needs_metadata else "isin",
"continent": f"coalesce({m_prefix}continent, 'Unknown')" if needs_metadata else "'Unknown'",
"sector": f"coalesce({m_prefix}sector, 'Unknown')" if needs_metadata else "'Unknown'"
}
selected_metric = metrics_map.get(metric, metrics_map["volume"])
selected_group = groups_map.get(group_by, groups_map["day"])
query = f"select {selected_group} as label"
if sub_group_by and sub_group_by in groups_map:
query += f", {groups_map[sub_group_by]} as sub_label"
query += f", {selected_metric} as value from trades"
if needs_metadata:
query += " t left join metadata m on t.isin = m.isin"
query += " where 1=1"
if date_from:
query += f" and {t_prefix}timestamp >= '{date_from}'"
if date_to:
query += f" and {t_prefix}timestamp <= '{date_to}'"
if isins:
isins_list = ",".join([f"'{i.strip()}'" for i in isins.split(",")])
query += f" and {t_prefix}isin in ({isins_list})"
if continents and needs_metadata:
cont_list = ",".join([f"'{c.strip()}'" for c in continents.split(",")])
query += f" and {m_prefix}continent in ({cont_list})"
query += " group by label"
if sub_group_by and sub_group_by in groups_map:
query += ", sub_label"
query += " order by label asc"
return query
# Test cases
print("Case 1: Basic analytics (reported error case)")
print(mock_get_analytics(metric="volume", group_by="day", date_from="2025-12-26", date_to="2026-01-25", isins="DE000LS1LUS9"))
print("\nCase 2: Filtering by continent (should join)")
print(mock_get_analytics(metric="volume", group_by="day", continents="Europe"))
print("\nCase 3: Grouping by name (should join)")
print(mock_get_analytics(metric="count", group_by="name"))