"""Build a JSON summary of per-project assets found under ``ROOT``.

For every entry in ``PROJECTS`` the script collects files from the project's
dedicated directories plus any file under ``ROOT`` whose path mentions one of
the project's aliases, then records:

* a capped list of image paths,
* the first rows of the first sheets of small ``.xlsx`` files,
* the text of the first pages of small fact-sheet / payment-plan PDFs.

The result is written to ``OUTPUT`` as UTF-8 JSON.
"""

from __future__ import annotations

import json
import re
import unicodedata
from functools import lru_cache
from pathlib import Path
from typing import Any

from openpyxl import load_workbook
from pypdf import PdfReader

ROOT = Path("dev/immobilien 07-05-2026/AZIZI SharePoint")
OUTPUT = Path("dev/immobilien 07-05-2026/azizi-extraction.json")

# Size / volume limits used while scanning (bytes unless stated otherwise).
IMAGE_SUFFIXES = frozenset({".jpg", ".jpeg", ".png", ".webp"})
MAX_IMAGE_BYTES = 25_000_000
MAX_XLSX_BYTES = 8_000_000
MAX_PDF_BYTES = 7_000_000
PDF_NAME_TOKENS = ("fact", "payment plan")
MAX_PDF_PAGES = 2
MAX_PDF_CHARS = 3500
MAX_SHEETS = 2
MAX_ROWS_PER_SHEET = 28
MAX_CELLS_PER_ROW = 8
MAX_IMAGES_PER_PROJECT = 12
MAX_XLSX_PER_PROJECT = 4
MAX_PDFS_PER_PROJECT = 3

# slug -> project description.
#   title        : display name
#   aliases      : lower-case names searched for in file paths
#   dirs         : directories (relative to ROOT) dedicated to the project
#   official_url : project page URL copied into the output
#
# NOTE(review): "AI Jaddaf and DHCC" (capital i) looks like a typo for
# "Al Jaddaf"; left untouched because it has to match the on-disk folder name
# -- verify against the export.
# NOTE(review): "Off-Plan Projects Azizi/MBR City/Riviera Beachfront" is the
# only entry with that extra prefix -- confirm it is relative to ROOT like the
# others.
PROJECTS: dict[str, dict[str, Any]] = {
    "mina": {
        "title": "Mina",
        "aliases": ["mina"],
        "dirs": ["Palm Jumeirah/Mina", "Beno Photography/Mina"],
        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/mina",
    },
    "royal-bay": {
        "title": "Royal Bay",
        "aliases": ["royal bay"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/royal-bay",
    },
    "creek-views": {
        "title": "Creek Views",
        "aliases": ["creek views"],
        "dirs": ["AI Jaddaf and DHCC/Creek Views"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views",
    },
    "creek-views-ii": {
        "title": "Creek Views II",
        "aliases": ["creek views ii", "creek views 2"],
        "dirs": ["AI Jaddaf and DHCC/Creek Views II"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views-ii",
    },
    "azizi-riviera": {
        "title": "Azizi Riviera",
        "aliases": ["riviera"],
        "dirs": [
            "MBR City/Riviera",
            "Beno Photography/Riviera Boulevard",
            "Beno Photography/Riviera Lagoon",
        ],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera",
    },
    "riviera-reve": {
        "title": "Riviera Reve",
        "aliases": [
            "reve",
            # Correct accented spelling (r, e-circumflex, v, e).
            "r\u00eave",
            # Legacy mis-decoded spelling that was in this list before; kept in
            # case on-disk names carry the same mojibake.
            "r\u0102\u00aave",
        ],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-reve",
    },
    "riviera-beachfront": {
        "title": "Riviera Beachfront",
        "aliases": ["riviera beachfront"],
        "dirs": ["Off-Plan Projects Azizi/MBR City/Riviera Beachfront"],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-beachfront",
    },
    "park-avenue": {
        "title": "Park Avenue",
        "aliases": ["park avenue"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/park-avenue",
    },
    "star": {
        "title": "Star",
        "aliases": ["star"],
        "dirs": ["Al Furjan/Star"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/star",
    },
    "roy-mediterranean": {
        "title": "ROY Mediterranean",
        "aliases": ["roy mediterranean", "roy"],
        "dirs": ["Al Furjan/Roy"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/roy-mediterranean",
    },
    "farishta": {
        "title": "Farishta",
        "aliases": ["farishta", "faristha"],
        "dirs": ["Al Furjan/Faristha"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/farishta",
    },
    "azizi-grand": {
        "title": "Azizi Grand",
        "aliases": ["azizi grand", "grand"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-sports-city/azizi-grand",
    },
    "beach-oasis": {
        "title": "Beach Oasis",
        "aliases": ["beach oasis"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/studio-city/beach-oasis",
    },
    "burj-azizi": {
        "title": "Burj Azizi",
        "aliases": ["burj azizi"],
        "dirs": [],
        "official_url": "https://www.burjazizi.com/",
    },
    "monaco-mansions": {
        "title": "Monaco Mansions",
        "aliases": ["monaco mansions", "monaco"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/monaco-mansions",
    },
    "azizi-venice": {
        "title": "Azizi Venice",
        "aliases": ["azizi venice", "venice"],
        "dirs": ["Dubai South/Azizi Venice"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/azizi-venice",
    },
    "azizi-milan": {
        "title": "Azizi Milan",
        "aliases": ["azizi milan", "milan"],
        "dirs": ["Azizi Milan"],
        "official_url": "https://www.azizidevelopments.com/dubai/azizi-milan/azizi-milan",
    },
    "azizi-wasel": {
        "title": "Azizi Wasel",
        "aliases": ["azizi wasel", "wasel"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-islands/azizi-wasel",
    },
}


def clean_text(value: str) -> str:
    """Collapse every whitespace run in *value* to one space and trim the ends."""
    return re.sub(r"\s+", " ", value).strip()


def json_value(value: Any) -> Any:
    """Return *value* in a form ``json.dumps`` can serialise.

    JSON-native scalars pass through unchanged, objects exposing
    ``isoformat()`` (dates, times, datetimes) become ISO strings, and anything
    else that ``json.dumps`` would reject (for example ``timedelta`` from
    duration-formatted cells) falls back to ``str(value)`` so that a single odd
    cell cannot abort the final dump.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if hasattr(value, "isoformat"):
        return value.isoformat()
    try:
        json.dumps(value)
    except (TypeError, ValueError):
        return str(value)
    return value


def is_small_facts_pdf(path: Path) -> bool:
    """Return True for PDFs up to ``MAX_PDF_BYTES`` whose file name contains
    one of ``PDF_NAME_TOKENS`` (case-insensitive)."""
    lower = path.name.lower()
    return path.stat().st_size <= MAX_PDF_BYTES and any(
        token in lower for token in PDF_NAME_TOKENS
    )


def extract_pdf_text(path: Path) -> str:
    """Return cleaned text of the first ``MAX_PDF_PAGES`` pages of *path*.

    The text is whitespace-normalised and cut to ``MAX_PDF_CHARS`` characters.
    Extraction is best-effort: any failure is reported as a bracketed message
    in the returned string instead of being raised.
    """
    try:
        reader = PdfReader(str(path))
        text = "\n".join(
            (page.extract_text() or "") for page in reader.pages[:MAX_PDF_PAGES]
        )
    except Exception as exc:  # best-effort: one bad PDF must not stop the run
        return f"[PDF extraction failed: {exc}]"
    return clean_text(text)[:MAX_PDF_CHARS]


def extract_xlsx(path: Path) -> list[list[Any]]:
    """Return the leading non-empty rows of the first sheets of *path*.

    Each returned row is ``[sheet_title, value, ...]`` holding at most
    ``MAX_CELLS_PER_ROW`` non-empty cell values; at most ``MAX_ROWS_PER_SHEET``
    non-empty rows are taken from each of the first ``MAX_SHEETS`` sheets.
    Extraction is best-effort: a failure is recorded as an
    ``["XLSX extraction failed", message]`` row rather than raised, and rows
    read before the failure are kept.
    """
    rows: list[list[Any]] = []
    try:
        workbook = load_workbook(path, data_only=True, read_only=True)
    except Exception as exc:  # best-effort: unreadable workbook
        return [["XLSX extraction failed", str(exc)]]
    try:
        for sheet in workbook.worksheets[:MAX_SHEETS]:
            seen = 0
            for row in sheet.iter_rows(values_only=True):
                present = [value for value in row if value is not None]
                if not present:
                    continue
                rows.append(
                    [sheet.title, *(json_value(v) for v in present[:MAX_CELLS_PER_ROW])]
                )
                seen += 1
                if seen >= MAX_ROWS_PER_SHEET:
                    break
    except Exception as exc:  # best-effort: damaged sheet part-way through
        rows.append(["XLSX extraction failed", str(exc)])
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        workbook.close()
    return rows


def _normalize(text: str) -> str:
    """Return *text* NFC-normalised and casefolded for comparison."""
    return unicodedata.normalize("NFC", text).casefold()


def _match_text(path: Path) -> str:
    """Return the normalised text of *path* that aliases are searched in.

    The path is taken relative to ``ROOT`` when possible so that words in the
    root location itself can never satisfy an alias.
    """
    try:
        relative = path.relative_to(ROOT)
    except ValueError:
        relative = path
    return _normalize(relative.as_posix())


@lru_cache(maxsize=None)
def _alias_pattern(aliases: tuple[str, ...]) -> re.Pattern[str] | None:
    """Compile *aliases* into one regex, or return None if none are usable.

    An alias only matches when it is not directly adjacent to another letter,
    so ``roy`` no longer matches inside ``royal`` and ``reve`` no longer
    matches inside ``revenue``. Digits, spaces and punctuation still count as
    separators (``riviera37`` matches ``riviera``).
    """
    usable = [_normalize(alias) for alias in aliases if alias.strip()]
    if not usable:
        return None
    alternation = "|".join(re.escape(alias) for alias in usable)
    # [^\W\d_] == "any Unicode letter".
    return re.compile(rf"(?<![^\W\d_])(?:{alternation})(?![^\W\d_])")


def file_matches(path: Path, aliases: list[str]) -> bool:
    """Return True if the path of *path* mentions any of *aliases*.

    Matching is case-insensitive, Unicode-normalised, performed on the path
    relative to ``ROOT`` (directory names included), and requires the alias to
    stand on letter boundaries. Blank aliases are ignored.
    """
    pattern = _alias_pattern(tuple(aliases))
    return pattern is not None and pattern.search(_match_text(path)) is not None


def _sort_key(path: Path) -> tuple[str, str]:
    """Case-insensitive, platform-independent ordering key for *path*."""
    posix = path.as_posix()
    return (posix.casefold(), posix)


def _existing_dirs(project: dict[str, Any]) -> list[Path]:
    """Return the project's dedicated directories that exist under ``ROOT``."""
    return [ROOT / rel_dir for rel_dir in project["dirs"] if (ROOT / rel_dir).is_dir()]


def candidate_files(project: dict[str, Any], all_files: list[Path]) -> list[Path]:
    """Return the de-duplicated files belonging to *project*.

    Files from the project's dedicated directories come first, followed by
    files from *all_files* whose path matches one of the project's aliases.
    Each group is sorted so that the capped selections made by callers are
    reproducible regardless of filesystem enumeration order.
    """
    from_dirs: list[Path] = []
    for base in _existing_dirs(project):
        from_dirs.extend(path for path in base.rglob("*") if path.is_file())
    from_aliases = [path for path in all_files if file_matches(path, project["aliases"])]

    ordered = sorted(from_dirs, key=_sort_key) + sorted(from_aliases, key=_sort_key)
    # Keyed by str(path); the first occurrence fixes the position.
    unique: dict[str, Path] = {}
    for path in ordered:
        unique[str(path)] = path
    return list(unique.values())


def summarize_project(slug: str, project: dict[str, Any], all_files: list[Path]) -> dict[str, Any]:
    """Return the JSON-ready summary for one project.

    The summary holds the project's identity, the dedicated directories that
    exist, counts of matching files per category, and capped lists of image
    paths, spreadsheet extracts and PDF text extracts.
    """
    files = candidate_files(project, all_files)
    images = [
        str(path)
        for path in files
        if path.suffix.lower() in IMAGE_SUFFIXES and path.stat().st_size <= MAX_IMAGE_BYTES
    ]
    xlsx_files = [
        path
        for path in files
        if path.suffix.lower() == ".xlsx" and path.stat().st_size <= MAX_XLSX_BYTES
    ]
    pdf_files = [
        path for path in files if path.suffix.lower() == ".pdf" and is_small_facts_pdf(path)
    ]
    return {
        "slug": slug,
        "title": project["title"],
        "official_url": project["official_url"],
        "source_dirs": [str(base) for base in _existing_dirs(project)],
        "coverage": {
            "files": len(files),
            "images": len(images),
            "xlsx": len(xlsx_files),
            "small_fact_pdfs": len(pdf_files),
        },
        "image_candidates": images[:MAX_IMAGES_PER_PROJECT],
        "xlsx_extracts": [
            {"path": str(path), "rows": extract_xlsx(path)}
            for path in xlsx_files[:MAX_XLSX_PER_PROJECT]
        ],
        "pdf_extracts": [
            {"path": str(path), "text": extract_pdf_text(path)}
            for path in pdf_files[:MAX_PDFS_PER_PROJECT]
        ],
    }


def main() -> None:
    """Scan ``ROOT``, summarise every project and write ``OUTPUT``.

    Raises:
        SystemExit: if ``ROOT`` is not a directory, so that a wrong working
            directory cannot overwrite an earlier extraction with empty data.
    """
    if not ROOT.is_dir():
        raise SystemExit(f"Source root not found: {ROOT}")

    all_files = sorted((path for path in ROOT.rglob("*") if path.is_file()), key=_sort_key)
    extraction = {
        "source_root": str(ROOT),
        "note": "Automated extraction from small factsheets, project-information spreadsheets and image filenames. Large brochures are intentionally skipped.",
        "projects": {
            slug: summarize_project(slug, project, all_files)
            for slug, project in PROJECTS.items()
        },
    }
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT.write_text(json.dumps(extraction, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Wrote {OUTPUT}")


if __name__ == "__main__":
    main()