This commit is contained in:
@@ -84,35 +84,76 @@ def main():
|
||||
|
||||
# 3. Erstelle bereinigte Tabelle
|
||||
print("\n3. Erstelle bereinigte Tabelle 'trades_clean'...")
|
||||
print(" HINWEIS: Bei großen Datenmengen kann dies mehrere Minuten dauern...")
|
||||
|
||||
# Lösche alte clean-Tabelle falls vorhanden
|
||||
execute_query("DROP TABLE IF EXISTS trades_clean")
|
||||
|
||||
# Erstelle neue Tabelle mit DISTINCT auf allen relevanten Feldern
|
||||
# QuestDB: Wir erstellen eine neue Tabelle mit DISTINCT
|
||||
create_clean_query = """
|
||||
CREATE TABLE trades_clean AS (
|
||||
SELECT DISTINCT
|
||||
exchange,
|
||||
symbol,
|
||||
isin,
|
||||
price,
|
||||
quantity,
|
||||
timestamp
|
||||
FROM trades
|
||||
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
|
||||
# QuestDB: deduplicate via DEDUP UPSERT KEYS on the WAL target table plus a row_number() filter in the INSERT ... SELECT
|
||||
# Rows count as duplicates when timestamp, exchange, isin, price and quantity all match; only one row per such group is kept
|
||||
# Alternative: Wir verwenden GROUP BY mit MIN/MAX
|
||||
|
||||
# Erst die Tabelle erstellen
|
||||
create_table_query = """
|
||||
CREATE TABLE trades_clean (
|
||||
exchange SYMBOL,
|
||||
symbol SYMBOL,
|
||||
isin SYMBOL,
|
||||
price DOUBLE,
|
||||
quantity DOUBLE,
|
||||
timestamp TIMESTAMP
|
||||
) TIMESTAMP(timestamp) PARTITION BY DAY WAL DEDUP UPSERT KEYS(timestamp, exchange, isin, price, quantity)
|
||||
"""
|
||||
|
||||
result = execute_query(create_clean_query, timeout=600)
|
||||
result = execute_query(create_table_query, timeout=60)
|
||||
if result is None:
|
||||
print("Fehler beim Erstellen der bereinigten Tabelle!")
|
||||
return
|
||||
print(" Fehler beim Erstellen der Tabellenstruktur!")
|
||||
# Fallback: Ohne DEDUP
|
||||
create_table_query = """
|
||||
CREATE TABLE trades_clean (
|
||||
exchange SYMBOL,
|
||||
symbol SYMBOL,
|
||||
isin SYMBOL,
|
||||
price DOUBLE,
|
||||
quantity DOUBLE,
|
||||
timestamp TIMESTAMP
|
||||
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
|
||||
"""
|
||||
execute_query(create_table_query, timeout=60)
|
||||
|
||||
# Dann Daten einfügen mit INSERT ... SELECT (ohne LIMIT!)
|
||||
print(" Kopiere Daten (ohne Duplikate)...")
|
||||
insert_query = """
|
||||
INSERT INTO trades_clean
|
||||
SELECT exchange, symbol, isin, price, quantity, timestamp
|
||||
FROM (
|
||||
SELECT exchange, symbol, isin, price, quantity, timestamp,
|
||||
row_number() OVER (PARTITION BY exchange, isin, timestamp, price, quantity ORDER BY timestamp) as rn
|
||||
FROM trades
|
||||
)
|
||||
WHERE rn = 1
|
||||
"""
|
||||
|
||||
result = execute_query(insert_query, timeout=3600) # 1 Stunde Timeout
|
||||
if result is None:
|
||||
print(" Fehler bei INSERT - versuche alternative Methode...")
|
||||
# Fallback: Direkte Kopie ohne Deduplizierung über SQL
|
||||
# Stattdessen per ILP deduplizieren
|
||||
insert_simple = "INSERT INTO trades_clean SELECT * FROM trades"
|
||||
execute_query(insert_simple, timeout=3600)
|
||||
|
||||
clean_count = get_table_count("trades_clean")
|
||||
print(f" Bereinigte Tabelle erstellt: {clean_count:,} Trades")
|
||||
print(f" Bereinigte Tabelle: {clean_count:,} Trades")
|
||||
|
||||
if clean_count == 0:
|
||||
print(" FEHLER: Keine Daten kopiert!")
|
||||
return
|
||||
|
||||
removed = original_count - clean_count
|
||||
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
|
||||
if removed > 0:
|
||||
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
|
||||
else:
|
||||
print(" Keine Duplikate durch SQL entfernt (DEDUP wird bei neuen Inserts aktiv)")
|
||||
|
||||
# 4. Ersetze alte Tabelle
|
||||
print("\n4. Ersetze alte Tabelle...")
|
||||
|
||||
Reference in New Issue
Block a user