dev immobile

2026-05-08 10:41:03 +02:00 · 2026-05-08 10:41:03 +02:00 · e198d842ce
commit e198d842ce
parent 18ca2ce858
18 changed files with 5931 additions and 3 deletions
--- a/07-05-2026/extract_azizi_sources.py
+++ b/07-05-2026/extract_azizi_sources.py
@@ -0,0 +1,250 @@
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+from openpyxl import load_workbook
+from pypdf import PdfReader
+
+
+# Directory tree that main() scans recursively for source files. The path is
+# relative, so it is resolved against the current working directory.
+ROOT = Path("dev/immobilien 07-05-2026/AZIZI SharePoint")
+# JSON file that main() writes the extraction result to (also a relative path).
+OUTPUT = Path("dev/immobilien 07-05-2026/azizi-extraction.json")
+
+# Project catalogue, keyed by slug. Each entry holds:
+#   title        - display name copied into the output.
+#   aliases      - lowercase strings; file_matches() treats a file as belonging
+#                  to the project when any alias occurs as a plain substring of
+#                  the lowercased full path.
+#   dirs         - folders relative to ROOT; candidate_files() includes every
+#                  file found below them (folders that do not exist are skipped).
+#   official_url - copied verbatim into the output.
+# NOTE(review): aliases are substring matches, so short ones ("star", "roy",
+# "grand", "riviera", "creek views") also hit paths of other projects, e.g.
+# "roy" occurs inside "royal bay" - confirm this overlap is acceptable.
+PROJECTS: dict[str, dict[str, Any]] = {
+    "mina": {
+        "title": "Mina",
+        "aliases": ["mina"],
+        "dirs": ["Palm Jumeirah/Mina", "Beno Photography/Mina"],
+        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/mina",
+    },
+    "royal-bay": {
+        "title": "Royal Bay",
+        "aliases": ["royal bay"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/royal-bay",
+    },
+    # NOTE(review): the folder below is spelled "AI Jaddaf" (capital i), while
+    # other folders use "Al ..." - presumably mirrors the name on disk; confirm.
+    "creek-views": {
+        "title": "Creek Views",
+        "aliases": ["creek views"],
+        "dirs": ["AI Jaddaf and DHCC/Creek Views"],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views",
+    },
+    "creek-views-ii": {
+        "title": "Creek Views II",
+        "aliases": ["creek views ii", "creek views 2"],
+        "dirs": ["AI Jaddaf and DHCC/Creek Views II"],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views-ii",
+    },
+    "azizi-riviera": {
+        "title": "Azizi Riviera",
+        "aliases": ["riviera"],
+        "dirs": ["MBR City/Riviera", "Beno Photography/Riviera Boulevard", "Beno Photography/Riviera Lagoon"],
+        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera",
+    },
+    "riviera-reve": {
+        "title": "Riviera Reve",
+        "aliases": ["reve", "rêve"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-reve",
+    },
+    # NOTE(review): this is the only entry whose folder starts with
+    # "Off-Plan Projects Azizi/" - verify the other "dirs" do not need it too.
+    "riviera-beachfront": {
+        "title": "Riviera Beachfront",
+        "aliases": ["riviera beachfront"],
+        "dirs": ["Off-Plan Projects Azizi/MBR City/Riviera Beachfront"],
+        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-beachfront",
+    },
+    "park-avenue": {
+        "title": "Park Avenue",
+        "aliases": ["park avenue"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/meydan/park-avenue",
+    },
+    "star": {
+        "title": "Star",
+        "aliases": ["star"],
+        "dirs": ["Al Furjan/Star"],
+        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/star",
+    },
+    "roy-mediterranean": {
+        "title": "ROY Mediterranean",
+        "aliases": ["roy mediterranean", "roy"],
+        "dirs": ["Al Furjan/Roy"],
+        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/roy-mediterranean",
+    },
+    # The folder and the second alias use the spelling "Faristha", which
+    # differs from the title "Farishta"; both spellings are listed as aliases.
+    "farishta": {
+        "title": "Farishta",
+        "aliases": ["farishta", "faristha"],
+        "dirs": ["Al Furjan/Faristha"],
+        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/farishta",
+    },
+    "azizi-grand": {
+        "title": "Azizi Grand",
+        "aliases": ["azizi grand", "grand"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-sports-city/azizi-grand",
+    },
+    "beach-oasis": {
+        "title": "Beach Oasis",
+        "aliases": ["beach oasis"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/studio-city/beach-oasis",
+    },
+    "burj-azizi": {
+        "title": "Burj Azizi",
+        "aliases": ["burj azizi"],
+        "dirs": [],
+        "official_url": "https://www.burjazizi.com/",
+    },
+    "monaco-mansions": {
+        "title": "Monaco Mansions",
+        "aliases": ["monaco mansions", "monaco"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/monaco-mansions",
+    },
+    "azizi-venice": {
+        "title": "Azizi Venice",
+        "aliases": ["azizi venice", "venice"],
+        "dirs": ["Dubai South/Azizi Venice"],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/azizi-venice",
+    },
+    "azizi-milan": {
+        "title": "Azizi Milan",
+        "aliases": ["azizi milan", "milan"],
+        "dirs": ["Azizi Milan"],
+        "official_url": "https://www.azizidevelopments.com/dubai/azizi-milan/azizi-milan",
+    },
+    "azizi-wasel": {
+        "title": "Azizi Wasel",
+        "aliases": ["azizi wasel", "wasel"],
+        "dirs": [],
+        "official_url": "https://www.azizidevelopments.com/dubai/dubai-islands/azizi-wasel",
+    },
+}
+
+
+def clean_text(value: str) -> str:
+    return re.sub(r"\s+", " ", value).strip()
+
+
+def json_value(value: Any) -> Any:
+    if hasattr(value, "isoformat"):
+        return value.isoformat()
+
+    return value
+
+
+def is_small_facts_pdf(path: Path) -> bool:
+    lower = path.name.lower()
+    return path.stat().st_size <= 7_000_000 and any(token in lower for token in ["fact", "payment plan"])
+
+
+def extract_pdf_text(path: Path) -> str:
+    try:
+        reader = PdfReader(str(path))
+        text = "\n".join((page.extract_text() or "") for page in reader.pages[:2])
+    except Exception as exc:
+        return f"[PDF extraction failed: {exc}]"
+
+    return clean_text(text)[:3500]
+
+
+def extract_xlsx(path: Path) -> list[list[Any]]:
+    rows: list[list[Any]] = []
+    try:
+        workbook = load_workbook(path, data_only=True, read_only=True)
+    except Exception as exc:
+        return [["XLSX extraction failed", str(exc)]]
+
+    for sheet in workbook.worksheets[:2]:
+        seen = 0
+        for row in sheet.iter_rows():
+            values = [cell.value for cell in row]
+            compact = [json_value(value) for value in values if value is not None]
+            if compact:
+                rows.append([sheet.title, *compact[:8]])
+                seen += 1
+            if seen >= 28:
+                break
+
+    return rows
+
+
+def file_matches(path: Path, aliases: list[str]) -> bool:
+    haystack = str(path).lower()
+    return any(alias.lower() in haystack for alias in aliases)
+
+
+def candidate_files(project: dict[str, Any], all_files: list[Path]) -> list[Path]:
+    candidates: list[Path] = []
+    for rel_dir in project["dirs"]:
+        base = ROOT / rel_dir
+        if base.exists():
+            candidates.extend([path for path in base.rglob("*") if path.is_file()])
+
+    candidates.extend(path for path in all_files if file_matches(path, project["aliases"]))
+
+    unique: dict[str, Path] = {}
+    for path in candidates:
+        unique[str(path)] = path
+
+    return list(unique.values())
+
+
+def summarize_project(slug: str, project: dict[str, Any], all_files: list[Path]) -> dict[str, Any]:
+    files = candidate_files(project, all_files)
+    images = [
+        str(path)
+        for path in files
+        if path.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp"} and path.stat().st_size <= 25_000_000
+    ]
+    xlsx_files = [path for path in files if path.suffix.lower() == ".xlsx" and path.stat().st_size <= 8_000_000]
+    pdf_files = [path for path in files if path.suffix.lower() == ".pdf" and is_small_facts_pdf(path)]
+
+    return {
+        "slug": slug,
+        "title": project["title"],
+        "official_url": project["official_url"],
+        "source_dirs": [str(ROOT / rel_dir) for rel_dir in project["dirs"] if (ROOT / rel_dir).exists()],
+        "coverage": {
+            "files": len(files),
+            "images": len(images),
+            "xlsx": len(xlsx_files),
+            "small_fact_pdfs": len(pdf_files),
+        },
+        "image_candidates": images[:12],
+        "xlsx_extracts": [
+            {
+                "path": str(path),
+                "rows": extract_xlsx(path),
+            }
+            for path in xlsx_files[:4]
+        ],
+        "pdf_extracts": [
+            {
+                "path": str(path),
+                "text": extract_pdf_text(path),
+            }
+            for path in pdf_files[:3]
+        ],
+    }
+
+
+def main() -> None:
+    all_files = [path for path in ROOT.rglob("*") if path.is_file()]
+    extraction = {
+        "source_root": str(ROOT),
+        "note": "Automated extraction from small factsheets, project-information spreadsheets and image filenames. Large brochures are intentionally skipped.",
+        "projects": {
+            slug: summarize_project(slug, project, all_files)
+            for slug, project in PROJECTS.items()
+        },
+    }
+
+    OUTPUT.write_text(json.dumps(extraction, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"Wrote {OUTPUT}")
+
+
+# Run the extraction only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()