# b2in/dev/immobilien 07-05-2026/extract_azizi_sources.py
# 2026-05-08 10:41:03 +02:00
#
# 250 lines
# 8.3 KiB
# Python

from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any
from openpyxl import load_workbook
from pypdf import PdfReader
# Directory tree that main() scans recursively for source files. The path is
# relative, so it is resolved against the current working directory.
ROOT = Path("dev/immobilien 07-05-2026/AZIZI SharePoint")
# File that main() writes the JSON summary to (also a relative path).
OUTPUT = Path("dev/immobilien 07-05-2026/azizi-extraction.json")
# Catalogue of the projects to summarise, keyed by slug.
# Each entry provides:
#   title        - display name copied into the output
#   aliases      - lowercase name fragments that file_matches() looks for in file paths
#   dirs         - folders relative to ROOT whose files are always included (may be empty)
#   official_url - project page URL copied into the output
# NOTE(review): short aliases ("roy", "star", "grand", "reve") may also occur
# inside unrelated file or folder names - e.g. "roy" is contained in
# "royal bay" - verify that the matching in file_matches() keeps projects apart.
PROJECTS: dict[str, dict[str, Any]] = {
"mina": {
"title": "Mina",
"aliases": ["mina"],
"dirs": ["Palm Jumeirah/Mina", "Beno Photography/Mina"],
"official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/mina",
},
"royal-bay": {
"title": "Royal Bay",
"aliases": ["royal bay"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/royal-bay",
},
"creek-views": {
"title": "Creek Views",
"aliases": ["creek views"],
# NOTE(review): "AI Jaddaf" is spelled with a capital "I" here - confirm it
# matches the folder name on disk (as opposed to "Al Jaddaf").
"dirs": ["AI Jaddaf and DHCC/Creek Views"],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views",
},
"creek-views-ii": {
"title": "Creek Views II",
"aliases": ["creek views ii", "creek views 2"],
"dirs": ["AI Jaddaf and DHCC/Creek Views II"],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views-ii",
},
"azizi-riviera": {
"title": "Azizi Riviera",
"aliases": ["riviera"],
"dirs": ["MBR City/Riviera", "Beno Photography/Riviera Boulevard", "Beno Photography/Riviera Lagoon"],
"official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera",
},
"riviera-reve": {
"title": "Riviera Reve",
# Both the plain and the accented spelling are listed.
"aliases": ["reve", "rêve"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-reve",
},
"riviera-beachfront": {
"title": "Riviera Beachfront",
"aliases": ["riviera beachfront"],
"dirs": ["Off-Plan Projects Azizi/MBR City/Riviera Beachfront"],
"official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-beachfront",
},
"park-avenue": {
"title": "Park Avenue",
"aliases": ["park avenue"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/meydan/park-avenue",
},
"star": {
"title": "Star",
"aliases": ["star"],
"dirs": ["Al Furjan/Star"],
"official_url": "https://www.azizidevelopments.com/dubai/al-furjan/star",
},
"roy-mediterranean": {
"title": "ROY Mediterranean",
"aliases": ["roy mediterranean", "roy"],
"dirs": ["Al Furjan/Roy"],
"official_url": "https://www.azizidevelopments.com/dubai/al-furjan/roy-mediterranean",
},
"farishta": {
"title": "Farishta",
# "dirs" uses the spelling "Faristha", which is also listed as an alias.
"aliases": ["farishta", "faristha"],
"dirs": ["Al Furjan/Faristha"],
"official_url": "https://www.azizidevelopments.com/dubai/al-furjan/farishta",
},
"azizi-grand": {
"title": "Azizi Grand",
"aliases": ["azizi grand", "grand"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-sports-city/azizi-grand",
},
"beach-oasis": {
"title": "Beach Oasis",
"aliases": ["beach oasis"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/studio-city/beach-oasis",
},
"burj-azizi": {
"title": "Burj Azizi",
"aliases": ["burj azizi"],
"dirs": [],
"official_url": "https://www.burjazizi.com/",
},
"monaco-mansions": {
"title": "Monaco Mansions",
"aliases": ["monaco mansions", "monaco"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-south/monaco-mansions",
},
"azizi-venice": {
"title": "Azizi Venice",
"aliases": ["azizi venice", "venice"],
"dirs": ["Dubai South/Azizi Venice"],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-south/azizi-venice",
},
"azizi-milan": {
"title": "Azizi Milan",
"aliases": ["azizi milan", "milan"],
"dirs": ["Azizi Milan"],
"official_url": "https://www.azizidevelopments.com/dubai/azizi-milan/azizi-milan",
},
"azizi-wasel": {
"title": "Azizi Wasel",
"aliases": ["azizi wasel", "wasel"],
"dirs": [],
"official_url": "https://www.azizidevelopments.com/dubai/dubai-islands/azizi-wasel",
},
}
def clean_text(value: str) -> str:
    """Collapse every run of whitespace in *value* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", value)
    return collapsed.strip()
def json_value(value: Any) -> Any:
    """Convert a spreadsheet cell value into something ``json.dumps`` accepts.

    * ``None``, strings, numbers, booleans and plain containers are returned
      unchanged.
    * Objects exposing ``isoformat()`` (dates, times, datetimes) are returned
      as their ISO-8601 string.
    * Anything else (for example ``datetime.timedelta`` from duration-formatted
      cells, or ``decimal.Decimal``) is converted with ``str()``. Without this
      fallback a single such cell would make the final ``json.dumps`` call
      raise ``TypeError`` and abort the whole run.
    """
    if value is None or isinstance(value, (str, int, float, bool, list, tuple, dict)):
        return value
    if hasattr(value, "isoformat"):
        return value.isoformat()
    # Last resort: keep a readable representation instead of failing later.
    return str(value)
def is_small_facts_pdf(path: Path) -> bool:
    """Tell whether *path* looks like a small fact sheet or payment plan.

    The file qualifies when it is at most 7,000,000 bytes on disk and its
    lower-cased file name contains "fact" or "payment plan". The file
    extension is not inspected here; callers filter on it themselves.
    """
    filename = path.name.lower()
    if path.stat().st_size > 7_000_000:
        return False
    for keyword in ("fact", "payment plan"):
        if keyword in filename:
            return True
    return False
def extract_pdf_text(path: Path) -> str:
    """Return a whitespace-normalised excerpt from the first two pages of a PDF.

    The page texts are joined with newlines, passed through ``clean_text`` and
    cut to at most 3500 characters. Extraction is best-effort: any exception
    raised while opening or reading the file is reported in-band as
    ``"[PDF extraction failed: ...]"`` instead of propagating.
    """
    try:
        pdf = PdfReader(str(path))
        page_texts = []
        for page in pdf.pages[:2]:
            # extract_text() may yield nothing for a page; treat that as empty.
            page_texts.append(page.extract_text() or "")
        raw = "\n".join(page_texts)
    except Exception as exc:
        return f"[PDF extraction failed: {exc}]"
    excerpt = clean_text(raw)
    return excerpt[:3500]
def extract_xlsx(path: Path) -> list[list[Any]]:
    """Read a compact preview of the first two worksheets of an .xlsx file.

    The workbook is opened with ``data_only=True, read_only=True``. For each of
    the first two worksheets, up to 28 non-empty rows are kept. Every kept row
    becomes ``[sheet title, v1, v2, ...]`` with at most 8 non-``None`` cell
    values, each passed through ``json_value``.

    Extraction is best-effort:

    * if the workbook cannot be opened, the result is the single row
      ``["XLSX extraction failed", <error text>]``;
    * if reading fails part-way, the rows collected so far are returned,
      followed by one such failure row.

    The workbook is always closed before returning, because a read-only
    workbook keeps its file handle open until ``close()`` is called.
    """
    try:
        workbook = load_workbook(path, data_only=True, read_only=True)
    except Exception as exc:
        return [["XLSX extraction failed", str(exc)]]

    rows: list[list[Any]] = []
    try:
        for sheet in workbook.worksheets[:2]:
            seen = 0  # non-empty rows kept for this sheet
            for row in sheet.iter_rows():
                compact = [json_value(cell.value) for cell in row if cell.value is not None]
                if not compact:
                    continue
                rows.append([sheet.title, *compact[:8]])
                seen += 1
                if seen >= 28:
                    break
    except Exception as exc:
        # Keep what was read and record the failure instead of aborting the run.
        rows.append(["XLSX extraction failed", str(exc)])
    finally:
        workbook.close()
    return rows
def file_matches(path: Path, aliases: list[str]) -> bool:
    """Return True when any alias occurs in *path* as a standalone name.

    Matching is case-insensitive and runs against the full path string. An
    alias only counts when it is not embedded in a longer run of letters, so
    "roy" matches ".../Roy/..." or "roy_plan.pdf" but not "Royal Bay", and
    "star" does not match "Started". Digits, underscores, spaces and
    punctuation all count as separators.

    Empty aliases are ignored; a bare substring test would let them match
    every path.
    """
    haystack = str(path).lower()
    for alias in aliases:
        needle = alias.lower()
        if not needle:
            continue
        # [^\W\d_] is "any letter": reject hits glued to a letter on either side.
        pattern = rf"(?<![^\W\d_]){re.escape(needle)}(?![^\W\d_])"
        if re.search(pattern, haystack):
            return True
    return False
def candidate_files(project: dict[str, Any], all_files: list[Path]) -> list[Path]:
    """Collect the files that belong to *project*, without duplicates.

    Two sources are combined, in this order:

    1. every regular file found recursively below each existing
       ``ROOT / <dir>`` listed in ``project["dirs"]``;
    2. every entry of *all_files* accepted by ``file_matches`` for
       ``project["aliases"]``.

    Duplicates are detected by the string form of the path; the position of
    the first occurrence is kept.
    """
    gathered: list[Path] = []
    for rel_dir in project["dirs"]:
        project_dir = ROOT / rel_dir
        if not project_dir.exists():
            continue
        for entry in project_dir.rglob("*"):
            if entry.is_file():
                gathered.append(entry)
    gathered += [entry for entry in all_files if file_matches(entry, project["aliases"])]
    # Keyed by string form: a dict keeps first-insertion order while dropping repeats.
    by_string: dict[str, Path] = {str(entry): entry for entry in gathered}
    return list(by_string.values())
def summarize_project(slug: str, project: dict[str, Any], all_files: list[Path]) -> dict[str, Any]:
    """Build the JSON-ready summary for one project.

    The project's candidate files are split into three groups:

    * images (.jpg/.jpeg/.png/.webp, at most 25,000,000 bytes), kept as path strings;
    * spreadsheets (.xlsx, at most 8,000,000 bytes);
    * PDFs accepted by ``is_small_facts_pdf``.

    The summary reports the size of each group under "coverage" and includes
    the first 12 image paths, extracts of the first 4 spreadsheets and
    extracts of the first 3 PDFs.
    """
    files = candidate_files(project, all_files)

    image_suffixes = {".jpg", ".jpeg", ".png", ".webp"}
    images: list[str] = []
    for path in files:
        if path.suffix.lower() in image_suffixes and path.stat().st_size <= 25_000_000:
            images.append(str(path))

    xlsx_files: list[Path] = []
    for path in files:
        if path.suffix.lower() == ".xlsx" and path.stat().st_size <= 8_000_000:
            xlsx_files.append(path)

    pdf_files: list[Path] = []
    for path in files:
        if path.suffix.lower() == ".pdf" and is_small_facts_pdf(path):
            pdf_files.append(path)

    summary: dict[str, Any] = {"slug": slug}
    summary["title"] = project["title"]
    summary["official_url"] = project["official_url"]

    # Only list the configured folders that are actually present below ROOT.
    source_dirs: list[str] = []
    for rel_dir in project["dirs"]:
        if (ROOT / rel_dir).exists():
            source_dirs.append(str(ROOT / rel_dir))
    summary["source_dirs"] = source_dirs

    summary["coverage"] = {
        "files": len(files),
        "images": len(images),
        "xlsx": len(xlsx_files),
        "small_fact_pdfs": len(pdf_files),
    }
    summary["image_candidates"] = images[:12]

    xlsx_extracts = []
    for path in xlsx_files[:4]:
        xlsx_extracts.append({"path": str(path), "rows": extract_xlsx(path)})
    summary["xlsx_extracts"] = xlsx_extracts

    pdf_extracts = []
    for path in pdf_files[:3]:
        pdf_extracts.append({"path": str(path), "text": extract_pdf_text(path)})
    summary["pdf_extracts"] = pdf_extracts

    return summary
def main() -> None:
    """Scan ROOT, summarise every entry of PROJECTS and write the result to OUTPUT.

    The output is a UTF-8 JSON document (indented by two spaces, non-ASCII
    characters kept as-is) holding the source root, a fixed note and one
    summary per project slug. The output path is printed when done.
    """
    all_files = []
    for entry in ROOT.rglob("*"):
        if entry.is_file():
            all_files.append(entry)

    projects = {}
    for slug, project in PROJECTS.items():
        projects[slug] = summarize_project(slug, project, all_files)

    extraction = {
        "source_root": str(ROOT),
        "note": "Automated extraction from small factsheets, project-information spreadsheets and image filenames. Large brochures are intentionally skipped.",
        "projects": projects,
    }
    payload = json.dumps(extraction, ensure_ascii=False, indent=2)
    OUTPUT.write_text(payload, encoding="utf-8")
    print(f"Wrote {OUTPUT}")


if __name__ == "__main__":
    main()