import requests import gzip import json import io from datetime import datetime, timedelta, timezone from typing import List, Optional from .base import BaseExchange, Trade from bs4 import BeautifulSoup # Browser User-Agent für Zugriff HEADERS = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' } class DeutscheBoerseBase(BaseExchange): """Basisklasse für Deutsche Börse Exchanges (Xetra, Frankfurt, Quotrix)""" @property def base_url(self) -> str: """Override in subclasses""" raise NotImplementedError @property def name(self) -> str: raise NotImplementedError def _get_file_list(self) -> List[str]: """Parst die Verzeichnisseite und extrahiert alle Dateinamen""" import re try: response = requests.get(self.base_url, headers=HEADERS, timeout=30) response.raise_for_status() files = [] html_text = response.text # Debug: Response-Länge print(f"[{self.name}] Response length: {len(html_text)} chars") # Primär: Regex-basierte Extraktion (zuverlässiger) # Pattern: PREFIX-posttrade-YYYY-MM-DDTHH_MM.json.gz # Das Prefix wird aus der base_url extrahiert (z.B. DETR, DFRA, DGAT) prefix_match = re.search(r'/([A-Z]{4})-posttrade', self.base_url) if prefix_match: prefix = prefix_match.group(1) # Suche nach Dateinamen mit diesem Prefix pattern = f'{prefix}-posttrade-\\d{{4}}-\\d{{2}}-\\d{{2}}T\\d{{2}}_\\d{{2}}\\.json\\.gz' else: # Generisches Pattern pattern = r'[A-Z]{4}-posttrade-\d{4}-\d{2}-\d{2}T\d{2}_\d{2}\.json\.gz' matches = re.findall(pattern, html_text) files = list(set(matches)) # Sekundär: BeautifulSoup für Links (falls Regex nichts findet) if not files: soup = BeautifulSoup(html_text, 'html.parser') all_links = soup.find_all('a') print(f"[{self.name}] Found {len(all_links)} total links on page") for link in all_links: href = link.get('href', '') text = link.get_text(strip=True) # Prüfe href und Text für posttrade Dateien if href and 'posttrade' in href.lower() and '.json.gz' in href.lower(): # Extrahiere nur den Dateinamen filename = href.split('/')[-1] if '/' in href else href files.append(filename) elif text and 'posttrade' in text.lower() and '.json.gz' in text.lower(): files.append(text) # Tertiär: Suche nach jedem "posttrade" im HTML und extrahiere Dateinamen if not files: # Allgemeineres Pattern für beliebige Dateinamen mit "posttrade" general_pattern = r'[\w-]*posttrade[\w-]*\d{4}[-_]\d{2}[-_]\d{2}[T_]\d{2}[_:]\d{2}\.json\.gz' matches = re.findall(general_pattern, html_text, re.IGNORECASE) files = list(set(matches)) if files: print(f"[{self.name}] Found {len(files)} files via general pattern") print(f"[{self.name}] Found {len(files)} files via regex/soup") return files except Exception as e: print(f"Error fetching file list from {self.base_url}: {e}") return [] def _filter_files_for_date(self, files: List[str], target_date: datetime.date) -> List[str]: """ Filtert Dateien für ein bestimmtes Datum. Dateiformat: DETR-posttrade-YYYY-MM-DDTHH_MM.json.gz (mit Unterstrich!) Da Handel bis 22:00 MEZ geht (21:00/20:00 UTC), müssen wir auch Dateien nach Mitternacht UTC berücksichtigen. """ import re filtered = [] # Für den Vortag: Dateien vom target_date UND vom Folgetag (bis ~02:00 UTC) target_str = target_date.strftime('%Y-%m-%d') next_day = target_date + timedelta(days=1) next_day_str = next_day.strftime('%Y-%m-%d') for file in files: # Extrahiere Datum aus Dateiname # Format: DETR-posttrade-2026-01-26T21_30.json.gz if target_str in file: filtered.append(file) elif next_day_str in file: # Prüfe ob es eine frühe Datei vom nächsten Tag ist (< 03:00 UTC) try: # Finde Timestamp im Dateinamen mit Unterstrich für Minuten match = re.search(r'posttrade-(\d{4}-\d{2}-\d{2})T(\d{2})_(\d{2})', file) if match: hour = int(match.group(2)) if hour < 3: # Frühe Morgenstunden gehören noch zum Vortag filtered.append(file) except Exception: pass return filtered def _download_and_parse_file(self, file_url: str) -> List[Trade]: """Lädt eine JSON.gz Datei herunter und parst die Trades""" trades = [] try: # Vollständige URL erstellen # Format: https://mfs.deutsche-boerse.com/DETR-posttrade/DETR-posttrade-2026-01-27T08_53.json.gz if not file_url.startswith('http'): # Entferne führenden Slash falls vorhanden filename = file_url.lstrip('/') full_url = f"{self.base_url}/{filename}" else: full_url = file_url response = requests.get(full_url, headers=HEADERS, timeout=60) if response.status_code == 404: print(f"[{self.name}] File not found: {full_url}") return [] response.raise_for_status() print(f"[{self.name}] Downloaded: {full_url} ({len(response.content)} bytes)") # Gzip entpacken with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f: json_data = json.load(f) # Trades parsen # Deutsche Börse JSON Format (RTS1/RTS2): # Typische Felder: TrdDt, TrdTm, ISIN, Pric, Qty, TrdCcy, etc. for record in json_data: try: trade = self._parse_trade_record(record) if trade: trades.append(trade) except Exception as e: print(f"Error parsing trade record: {e}") continue except Exception as e: print(f"Error downloading/parsing {file_url}: {e}") return trades def _parse_trade_record(self, record: dict) -> Optional[Trade]: """ Parst einen einzelnen Trade-Record aus dem JSON. Deutsche Börse verwendet RTS1/RTS2 Format. Wichtige Felder: - TrdDt: Trading Date (YYYY-MM-DD) - TrdTm: Trading Time (HH:MM:SS.ffffff) - ISIN: Instrument Identifier - FinInstrmId.Id: Alternative ISIN Feld - Pric.Pric.MntryVal.Amt: Preis - Qty.Unit: Menge """ try: # ISIN extrahieren isin = record.get('ISIN') or record.get('FinInstrmId', {}).get('Id', '') if not isin: return None # Preis extrahieren (verschiedene mögliche Pfade) price = None if 'Pric' in record: pric = record['Pric'] if isinstance(pric, dict): if 'Pric' in pric: inner = pric['Pric'] if 'MntryVal' in inner: price = float(inner['MntryVal'].get('Amt', 0)) elif 'Amt' in inner: price = float(inner['Amt']) elif 'MntryVal' in pric: price = float(pric['MntryVal'].get('Amt', 0)) elif isinstance(pric, (int, float)): price = float(pric) if price is None or price <= 0: return None # Menge extrahieren quantity = None if 'Qty' in record: qty = record['Qty'] if isinstance(qty, dict): quantity = float(qty.get('Unit', qty.get('Qty', 0))) elif isinstance(qty, (int, float)): quantity = float(qty) if quantity is None or quantity <= 0: return None # Timestamp extrahieren trd_dt = record.get('TrdDt', '') trd_tm = record.get('TrdTm', '00:00:00') if not trd_dt: return None # Kombiniere Datum und Zeit ts_str = f"{trd_dt}T{trd_tm}" # Entferne Mikrosekunden wenn zu lang if '.' in ts_str: parts = ts_str.split('.') if len(parts[1]) > 6: ts_str = parts[0] + '.' + parts[1][:6] # Parse als UTC (Deutsche Börse liefert UTC) timestamp = datetime.fromisoformat(ts_str) if timestamp.tzinfo is None: timestamp = timestamp.replace(tzinfo=timezone.utc) return Trade( exchange=self.name, symbol=isin, # Symbol = ISIN isin=isin, price=price, quantity=quantity, timestamp=timestamp ) except Exception as e: print(f"Error parsing record: {e}") return None def _generate_expected_files(self, target_date: datetime.date) -> List[str]: """ Generiert erwartete Dateinamen basierend auf dem bekannten Format. Format: PREFIX-posttrade-YYYY-MM-DDTHH_MM.json.gz """ import re files = [] # Extrahiere Prefix aus base_url (z.B. DETR, DFRA, DGAT) prefix_match = re.search(r'/([A-Z]{4})-posttrade', self.base_url) prefix = prefix_match.group(1) if prefix_match else 'DETR' date_str = target_date.strftime('%Y-%m-%d') # Generiere für alle Stunden des Handelstages (07:00 - 22:00 UTC, alle Minuten) for hour in range(7, 23): for minute in range(0, 60): files.append(f"{prefix}-posttrade-{date_str}T{hour:02d}_{minute:02d}.json.gz") # Auch frühe Dateien vom Folgetag (nach Mitternacht UTC) next_date = target_date + timedelta(days=1) next_date_str = next_date.strftime('%Y-%m-%d') for hour in range(0, 3): for minute in range(0, 60): files.append(f"{prefix}-posttrade-{next_date_str}T{hour:02d}_{minute:02d}.json.gz") return files def fetch_latest_trades(self, include_yesterday: bool = True, since_date: datetime = None) -> List[Trade]: """ Holt alle Trades vom Vortag (oder seit since_date). """ all_trades = [] # Bestimme Zieldatum if since_date: target_date = since_date.date() if hasattr(since_date, 'date') else since_date else: # Standard: Vortag target_date = (datetime.now(timezone.utc) - timedelta(days=1)).date() print(f"[{self.name}] Fetching trades for date: {target_date}") # Erst versuchen, Dateiliste von der Seite zu holen files = self._get_file_list() print(f"[{self.name}] Found {len(files)} total files") # Dateien für Zieldatum filtern target_files = self._filter_files_for_date(files, target_date) print(f"[{self.name}] {len(target_files)} files match target date") # Falls keine Dateien von der Seite gefunden, generiere erwartete Dateinamen if not target_files: print(f"[{self.name}] No files from page, trying generated filenames...") target_files = self._generate_expected_files(target_date) print(f"[{self.name}] Trying {len(target_files)} potential files") # Alle passenden Dateien herunterladen und parsen successful = 0 for file in target_files: trades = self._download_and_parse_file(file) if trades: all_trades.extend(trades) successful += 1 if successful <= 5: print(f"[{self.name}] Parsed {len(trades)} trades from {file}") if successful > 5: print(f"[{self.name}] ... and {successful - 5} more files") print(f"[{self.name}] Total trades fetched: {len(all_trades)}") return all_trades class XetraExchange(DeutscheBoerseBase): """Xetra (Deutsche Börse) - DETR""" @property def base_url(self) -> str: return "https://mfs.deutsche-boerse.com/DETR-posttrade" @property def name(self) -> str: return "XETRA" class FrankfurtExchange(DeutscheBoerseBase): """Börse Frankfurt - DFRA""" @property def base_url(self) -> str: return "https://mfs.deutsche-boerse.com/DFRA-posttrade" @property def name(self) -> str: return "FRA" class QuotrixExchange(DeutscheBoerseBase): """Quotrix (Düsseldorf/Tradegate) - DGAT""" @property def base_url(self) -> str: return "https://mfs.deutsche-boerse.com/DGAT-posttrade" @property def name(self) -> str: return "QUOTRIX"