This commit is contained in:
@@ -84,35 +84,76 @@ def main():
|
|||||||
|
|
||||||
# 3. Erstelle bereinigte Tabelle
|
# 3. Erstelle bereinigte Tabelle
|
||||||
print("\n3. Erstelle bereinigte Tabelle 'trades_clean'...")
|
print("\n3. Erstelle bereinigte Tabelle 'trades_clean'...")
|
||||||
|
print(" HINWEIS: Bei großen Datenmengen kann dies mehrere Minuten dauern...")
|
||||||
|
|
||||||
# Lösche alte clean-Tabelle falls vorhanden
|
# Lösche alte clean-Tabelle falls vorhanden
|
||||||
execute_query("DROP TABLE IF EXISTS trades_clean")
|
execute_query("DROP TABLE IF EXISTS trades_clean")
|
||||||
|
|
||||||
# Erstelle neue Tabelle mit DISTINCT auf allen relevanten Feldern
|
# QuestDB: SAMPLE BY 1T mit LATEST ON für Deduplizierung
|
||||||
# QuestDB: Wir erstellen eine neue Tabelle mit DISTINCT
|
# Das gruppiert nach Timestamp (auf Nanosekunde genau) und behält nur den letzten Eintrag
|
||||||
create_clean_query = """
|
# Alternative: Wir verwenden GROUP BY mit MIN/MAX
|
||||||
CREATE TABLE trades_clean AS (
|
|
||||||
SELECT DISTINCT
|
# Erst die Tabelle erstellen
|
||||||
exchange,
|
create_table_query = """
|
||||||
symbol,
|
CREATE TABLE trades_clean (
|
||||||
isin,
|
exchange SYMBOL,
|
||||||
price,
|
symbol SYMBOL,
|
||||||
quantity,
|
isin SYMBOL,
|
||||||
timestamp
|
price DOUBLE,
|
||||||
FROM trades
|
quantity DOUBLE,
|
||||||
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
|
timestamp TIMESTAMP
|
||||||
|
) TIMESTAMP(timestamp) PARTITION BY DAY WAL DEDUP UPSERT KEYS(timestamp, exchange, isin, price, quantity)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = execute_query(create_clean_query, timeout=600)
|
result = execute_query(create_table_query, timeout=60)
|
||||||
if result is None:
|
if result is None:
|
||||||
print("Fehler beim Erstellen der bereinigten Tabelle!")
|
print(" Fehler beim Erstellen der Tabellenstruktur!")
|
||||||
return
|
# Fallback: Ohne DEDUP
|
||||||
|
create_table_query = """
|
||||||
|
CREATE TABLE trades_clean (
|
||||||
|
exchange SYMBOL,
|
||||||
|
symbol SYMBOL,
|
||||||
|
isin SYMBOL,
|
||||||
|
price DOUBLE,
|
||||||
|
quantity DOUBLE,
|
||||||
|
timestamp TIMESTAMP
|
||||||
|
) TIMESTAMP(timestamp) PARTITION BY DAY WAL
|
||||||
|
"""
|
||||||
|
execute_query(create_table_query, timeout=60)
|
||||||
|
|
||||||
|
# Dann Daten einfügen mit INSERT ... SELECT (ohne LIMIT!)
|
||||||
|
print(" Kopiere Daten (ohne Duplikate)...")
|
||||||
|
insert_query = """
|
||||||
|
INSERT INTO trades_clean
|
||||||
|
SELECT exchange, symbol, isin, price, quantity, timestamp
|
||||||
|
FROM (
|
||||||
|
SELECT exchange, symbol, isin, price, quantity, timestamp,
|
||||||
|
row_number() OVER (PARTITION BY exchange, isin, timestamp, price, quantity ORDER BY timestamp) as rn
|
||||||
|
FROM trades
|
||||||
|
)
|
||||||
|
WHERE rn = 1
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = execute_query(insert_query, timeout=3600) # 1 Stunde Timeout
|
||||||
|
if result is None:
|
||||||
|
print(" Fehler bei INSERT - versuche alternative Methode...")
|
||||||
|
# Fallback: Direkte Kopie ohne Deduplizierung über SQL
|
||||||
|
# Stattdessen per ILP deduplizieren
|
||||||
|
insert_simple = "INSERT INTO trades_clean SELECT * FROM trades"
|
||||||
|
execute_query(insert_simple, timeout=3600)
|
||||||
|
|
||||||
clean_count = get_table_count("trades_clean")
|
clean_count = get_table_count("trades_clean")
|
||||||
print(f" Bereinigte Tabelle erstellt: {clean_count:,} Trades")
|
print(f" Bereinigte Tabelle: {clean_count:,} Trades")
|
||||||
|
|
||||||
|
if clean_count == 0:
|
||||||
|
print(" FEHLER: Keine Daten kopiert!")
|
||||||
|
return
|
||||||
|
|
||||||
removed = original_count - clean_count
|
removed = original_count - clean_count
|
||||||
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
|
if removed > 0:
|
||||||
|
print(f" Entfernte Duplikate: {removed:,} ({removed/original_count*100:.1f}%)")
|
||||||
|
else:
|
||||||
|
print(" Keine Duplikate durch SQL entfernt (DEDUP wird bei neuen Inserts aktiv)")
|
||||||
|
|
||||||
# 4. Ersetze alte Tabelle
|
# 4. Ersetze alte Tabelle
|
||||||
print("\n4. Ersetze alte Tabelle...")
|
print("\n4. Ersetze alte Tabelle...")
|
||||||
|
|||||||
125
restore_and_fix.py
Normal file
125
restore_and_fix.py
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Script zum Wiederherstellen und korrekten Bereinigen der Trades.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
DB_HOST = os.getenv("QUESTDB_HOST", "localhost")
|
||||||
|
DB_PORT = os.getenv("QUESTDB_PORT", "9000")
|
||||||
|
DB_USER = os.getenv("DB_USER", "admin")
|
||||||
|
DB_PASSWORD = os.getenv("DB_PASSWORD", "quest")
|
||||||
|
|
||||||
|
DB_URL = f"http://{DB_HOST}:{DB_PORT}"
|
||||||
|
DB_AUTH = (DB_USER, DB_PASSWORD) if DB_USER and DB_PASSWORD else None
|
||||||
|
|
||||||
|
def execute_query(query, timeout=300):
|
||||||
|
"""Führt eine QuestDB Query aus."""
|
||||||
|
try:
|
||||||
|
response = requests.get(
|
||||||
|
f"{DB_URL}/exec",
|
||||||
|
params={'query': query, 'count': 'true'},
|
||||||
|
auth=DB_AUTH,
|
||||||
|
timeout=timeout
|
||||||
|
)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return response.json()
|
||||||
|
else:
|
||||||
|
print(f"Query failed: {response.text[:500]}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error executing query: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_table_count(table_name):
|
||||||
|
"""Zählt Einträge in einer Tabelle."""
|
||||||
|
result = execute_query(f"SELECT count(*) FROM {table_name}")
|
||||||
|
if result and result.get('dataset'):
|
||||||
|
return result['dataset'][0][0]
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def table_exists(table_name):
|
||||||
|
"""Prüft ob eine Tabelle existiert."""
|
||||||
|
result = execute_query(f"SELECT count(*) FROM {table_name} LIMIT 1")
|
||||||
|
return result is not None
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("=" * 60)
|
||||||
|
print("QuestDB Daten-Wiederherstellung und Bereinigung")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# 1. Prüfe aktuellen Stand
|
||||||
|
current_count = get_table_count("trades")
|
||||||
|
print(f"\n1. Aktuelle Trades-Tabelle: {current_count:,} Einträge")
|
||||||
|
|
||||||
|
# 2. Prüfe ob Backup existiert
|
||||||
|
backup_exists = table_exists("trades_backup")
|
||||||
|
if backup_exists:
|
||||||
|
backup_count = get_table_count("trades_backup")
|
||||||
|
print(f" Backup-Tabelle gefunden: {backup_count:,} Einträge")
|
||||||
|
|
||||||
|
if backup_count > current_count:
|
||||||
|
print("\n2. Backup hat mehr Daten - Wiederherstellung möglich!")
|
||||||
|
|
||||||
|
response = input(" Backup wiederherstellen? (j/n): ")
|
||||||
|
if response.lower() == 'j':
|
||||||
|
print(" Lösche aktuelle Tabelle...")
|
||||||
|
execute_query("DROP TABLE trades")
|
||||||
|
|
||||||
|
print(" Benenne Backup um...")
|
||||||
|
execute_query("RENAME TABLE trades_backup TO trades")
|
||||||
|
|
||||||
|
new_count = get_table_count("trades")
|
||||||
|
print(f" Wiederhergestellt: {new_count:,} Trades")
|
||||||
|
else:
|
||||||
|
print(" Backup hat weniger/gleich viele Daten - keine Wiederherstellung nötig")
|
||||||
|
else:
|
||||||
|
print(" Kein Backup gefunden!")
|
||||||
|
|
||||||
|
# 3. Zeige Statistik pro Exchange
|
||||||
|
print("\n3. Trades pro Exchange:")
|
||||||
|
result = execute_query("""
|
||||||
|
SELECT exchange, count(*) as cnt
|
||||||
|
FROM trades
|
||||||
|
GROUP BY exchange
|
||||||
|
ORDER BY cnt DESC
|
||||||
|
""")
|
||||||
|
if result and result.get('dataset'):
|
||||||
|
for row in result['dataset']:
|
||||||
|
print(f" {row[0]}: {row[1]:,}")
|
||||||
|
|
||||||
|
# 4. Aktiviere DEDUP für zukünftige Inserts
|
||||||
|
print("\n4. Prüfe DEDUP-Status...")
|
||||||
|
# QuestDB: DEDUP kann nur bei Tabellenerstellung gesetzt werden
|
||||||
|
# Wir können aber eine neue Tabelle mit DEDUP erstellen
|
||||||
|
|
||||||
|
print("\n5. Empfehlung:")
|
||||||
|
print(" - Die Deduplizierung sollte im daemon.py erfolgen (bereits implementiert)")
|
||||||
|
print(" - Der Hash-basierte Check verhindert zukünftige Duplikate")
|
||||||
|
print(" - Für bestehende Duplikate: Manuelles Cleanup in Batches")
|
||||||
|
|
||||||
|
# 6. Zeige Duplikat-Analyse für eine Exchange
|
||||||
|
print("\n6. Stichproben-Analyse für Duplikate...")
|
||||||
|
result = execute_query("""
|
||||||
|
SELECT exchange, isin, timestamp, price, quantity, count(*) as cnt
|
||||||
|
FROM trades
|
||||||
|
WHERE exchange = 'EIX'
|
||||||
|
GROUP BY exchange, isin, timestamp, price, quantity
|
||||||
|
HAVING count(*) > 1
|
||||||
|
LIMIT 10
|
||||||
|
""", timeout=120)
|
||||||
|
|
||||||
|
if result and result.get('dataset') and len(result['dataset']) > 0:
|
||||||
|
print(" Gefundene Duplikate (Beispiele):")
|
||||||
|
for row in result['dataset'][:5]:
|
||||||
|
print(f" {row[0]} | {row[1]} | {row[2]} | {row[3]} | {row[4]} | {row[5]}x")
|
||||||
|
else:
|
||||||
|
print(" Keine Duplikate in EIX gefunden (oder Query timeout)")
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Fertig!")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Binary file not shown.
@@ -133,32 +133,31 @@ class GettexExchange(BaseExchange):
|
|||||||
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
|
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
|
||||||
csv_text = f.read().decode('utf-8')
|
csv_text = f.read().decode('utf-8')
|
||||||
|
|
||||||
# Debug: Zeige erste Zeilen und Spalten
|
|
||||||
lines = csv_text.strip().split('\n')
|
lines = csv_text.strip().split('\n')
|
||||||
if lines:
|
if not lines:
|
||||||
print(f"[GETTEX] CSV has {len(lines)} lines, first line (headers): {lines[0][:200]}")
|
return []
|
||||||
if len(lines) > 1:
|
|
||||||
print(f"[GETTEX] Sample data row: {lines[1][:200]}")
|
|
||||||
|
|
||||||
# CSV parsen - versuche verschiedene Delimiter
|
# Extrahiere Datum aus Dateinamen (Format: posttrade.YYYYMMDD.HH.MM.xxx.csv.gz)
|
||||||
delimiter = ';' if ';' in lines[0] else ','
|
date_str = None
|
||||||
reader = csv.DictReader(io.StringIO(csv_text), delimiter=delimiter)
|
parts = filename.split('.')
|
||||||
|
if len(parts) >= 4:
|
||||||
|
date_str = parts[1] # YYYYMMDD
|
||||||
|
|
||||||
row_count = 0
|
# Gettex CSV hat KEINEN Header!
|
||||||
for row in reader:
|
# Format: ISIN,Zeit,Währung,Preis,Menge
|
||||||
row_count += 1
|
# z.B.: DE000BAY0017,09:15:03.638460,EUR,45.775,22
|
||||||
if row_count == 1:
|
for line in lines:
|
||||||
print(f"[GETTEX] CSV columns: {list(row.keys())}")
|
if not line.strip():
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
trade = self._parse_csv_row(row)
|
trade = self._parse_headerless_csv_line(line, date_str)
|
||||||
if trade:
|
if trade:
|
||||||
trades.append(trade)
|
trades.append(trade)
|
||||||
except Exception as e:
|
except Exception:
|
||||||
if row_count <= 3:
|
|
||||||
print(f"[GETTEX] Error parsing row {row_count}: {e}, row keys: {list(row.keys())}")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"[GETTEX] Processed {row_count} rows, found {len(trades)} valid trades")
|
if trades:
|
||||||
|
print(f"[GETTEX] Parsed {len(trades)} trades from {filename}")
|
||||||
|
|
||||||
except requests.exceptions.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
if e.response.status_code != 404:
|
if e.response.status_code != 404:
|
||||||
@@ -168,6 +167,69 @@ class GettexExchange(BaseExchange):
|
|||||||
|
|
||||||
return trades
|
return trades
|
||||||
|
|
||||||
|
def _parse_headerless_csv_line(self, line: str, date_str: str = None) -> Optional[Trade]:
|
||||||
|
"""
|
||||||
|
Parst eine headerlose CSV-Zeile im gettex Format.
|
||||||
|
Format: ISIN,Zeit,Währung,Preis,Menge
|
||||||
|
z.B.: DE000BAY0017,09:15:03.638460,EUR,45.775,22
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
parts = line.strip().split(',')
|
||||||
|
if len(parts) < 5:
|
||||||
|
return None
|
||||||
|
|
||||||
|
isin = parts[0].strip()
|
||||||
|
time_str = parts[1].strip()
|
||||||
|
# currency = parts[2].strip() # nicht benötigt
|
||||||
|
price_str = parts[3].strip()
|
||||||
|
qty_str = parts[4].strip()
|
||||||
|
|
||||||
|
# Validierung
|
||||||
|
if not isin or len(isin) != 12: # ISIN ist immer 12 Zeichen
|
||||||
|
return None
|
||||||
|
|
||||||
|
price = float(price_str)
|
||||||
|
quantity = float(qty_str)
|
||||||
|
|
||||||
|
if price <= 0 or quantity <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Timestamp bauen
|
||||||
|
# date_str ist YYYYMMDD, time_str ist HH:MM:SS.ffffff
|
||||||
|
if date_str and len(date_str) == 8:
|
||||||
|
year = date_str[:4]
|
||||||
|
month = date_str[4:6]
|
||||||
|
day = date_str[6:8]
|
||||||
|
date_part = f"{year}-{month}-{day}"
|
||||||
|
else:
|
||||||
|
# Fallback: heute
|
||||||
|
date_part = datetime.now(timezone.utc).strftime('%Y-%m-%d')
|
||||||
|
|
||||||
|
# Zeit parsen (z.B. 09:15:03.638460)
|
||||||
|
ts_str = f"{date_part}T{time_str}"
|
||||||
|
|
||||||
|
# Mikrosekunden kürzen wenn zu lang
|
||||||
|
if '.' in ts_str:
|
||||||
|
base, frac = ts_str.rsplit('.', 1)
|
||||||
|
if len(frac) > 6:
|
||||||
|
frac = frac[:6]
|
||||||
|
ts_str = f"{base}.{frac}"
|
||||||
|
|
||||||
|
timestamp = datetime.fromisoformat(ts_str)
|
||||||
|
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
return Trade(
|
||||||
|
exchange=self.name,
|
||||||
|
symbol=isin,
|
||||||
|
isin=isin,
|
||||||
|
price=price,
|
||||||
|
quantity=quantity,
|
||||||
|
timestamp=timestamp
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
def _parse_csv_row(self, row: dict) -> Optional[Trade]:
|
def _parse_csv_row(self, row: dict) -> Optional[Trade]:
|
||||||
"""
|
"""
|
||||||
Parst eine CSV-Zeile zu einem Trade.
|
Parst eine CSV-Zeile zu einem Trade.
|
||||||
@@ -390,29 +452,30 @@ class GettexExchange(BaseExchange):
|
|||||||
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
|
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
|
||||||
csv_text = f.read().decode('utf-8')
|
csv_text = f.read().decode('utf-8')
|
||||||
|
|
||||||
# Debug: Zeige erste Zeilen
|
|
||||||
lines = csv_text.strip().split('\n')
|
lines = csv_text.strip().split('\n')
|
||||||
if len(lines) <= 1:
|
if not lines:
|
||||||
# Datei ist leer oder nur Header
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# CSV parsen - versuche verschiedene Delimiter
|
# Extrahiere Datum aus Dateinamen (Format: posttrade.YYYYMMDD.HH.MM.xxx.csv.gz)
|
||||||
delimiter = ';' if ';' in lines[0] else (',' if ',' in lines[0] else '\t')
|
date_str = None
|
||||||
reader = csv.DictReader(io.StringIO(csv_text), delimiter=delimiter)
|
parts = filename.split('.')
|
||||||
|
if len(parts) >= 4:
|
||||||
|
date_str = parts[1] # YYYYMMDD
|
||||||
|
|
||||||
row_count = 0
|
# Gettex CSV hat KEINEN Header!
|
||||||
for row in reader:
|
# Format: ISIN,Zeit,Währung,Preis,Menge
|
||||||
row_count += 1
|
for line in lines:
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
trade = self._parse_csv_row(row)
|
trade = self._parse_headerless_csv_line(line, date_str)
|
||||||
if trade:
|
if trade:
|
||||||
trades.append(trade)
|
trades.append(trade)
|
||||||
except Exception as e:
|
except Exception:
|
||||||
if row_count <= 2:
|
|
||||||
print(f"[{self.name}] Error parsing row: {e}, keys: {list(row.keys())[:5]}")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"[{self.name}] Parsed {len(trades)} trades from {filename}")
|
if trades:
|
||||||
|
print(f"[{self.name}] Parsed {len(trades)} trades from {filename}")
|
||||||
|
|
||||||
except requests.exceptions.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
if e.response.status_code != 404:
|
if e.response.status_code != 404:
|
||||||
|
|||||||
Reference in New Issue
Block a user