# File-viewer metadata captured with this source: 250 lines, 8.3 KiB, Python.
from __future__ import annotations

import json
import re
import unicodedata
from pathlib import Path
from typing import Any

from openpyxl import load_workbook
from pypdf import PdfReader
|
|
|
|
|
|
# Directory tree that is scanned for project material.
ROOT = Path("dev/immobilien 07-05-2026") / "AZIZI SharePoint"
# JSON report written by main(); it sits next to ROOT, not inside it.
OUTPUT = ROOT.parent / "azizi-extraction.json"
|
|
|
|
# Project catalogue keyed by slug. Every entry carries:
#   title        - display name copied into the report
#   aliases      - substrings looked for in file paths (compared case-insensitively)
#   dirs         - directories, relative to ROOT, whose files all belong to the project
#   official_url - project page copied into the report
PROJECTS: dict[str, dict[str, Any]] = {
    # --- official_url under /dubai/palm-jumeirah/ ---
    "mina": {
        "title": "Mina",
        "aliases": ["mina"],
        "dirs": ["Palm Jumeirah/Mina", "Beno Photography/Mina"],
        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/mina",
    },
    "royal-bay": {
        "title": "Royal Bay",
        "aliases": ["royal bay"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/palm-jumeirah/royal-bay",
    },
    # --- official_url under /dubai/dubai-healthcare-city/ ---
    "creek-views": {
        "title": "Creek Views",
        "aliases": ["creek views"],
        "dirs": ["AI Jaddaf and DHCC/Creek Views"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views",
    },
    "creek-views-ii": {
        "title": "Creek Views II",
        "aliases": ["creek views ii", "creek views 2"],
        "dirs": ["AI Jaddaf and DHCC/Creek Views II"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views-ii",
    },
    # --- official_url under /dubai/meydan/ ---
    "azizi-riviera": {
        "title": "Azizi Riviera",
        "aliases": ["riviera"],
        "dirs": ["MBR City/Riviera", "Beno Photography/Riviera Boulevard", "Beno Photography/Riviera Lagoon"],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera",
    },
    "riviera-reve": {
        "title": "Riviera Reve",
        "aliases": ["reve", "rêve"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-reve",
    },
    "riviera-beachfront": {
        "title": "Riviera Beachfront",
        "aliases": ["riviera beachfront"],
        "dirs": ["Off-Plan Projects Azizi/MBR City/Riviera Beachfront"],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/riviera-beachfront",
    },
    "park-avenue": {
        "title": "Park Avenue",
        "aliases": ["park avenue"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/meydan/park-avenue",
    },
    # --- official_url under /dubai/al-furjan/ ---
    "star": {
        "title": "Star",
        "aliases": ["star"],
        "dirs": ["Al Furjan/Star"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/star",
    },
    "roy-mediterranean": {
        "title": "ROY Mediterranean",
        "aliases": ["roy mediterranean", "roy"],
        "dirs": ["Al Furjan/Roy"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/roy-mediterranean",
    },
    "farishta": {
        "title": "Farishta",
        "aliases": ["farishta", "faristha"],
        "dirs": ["Al Furjan/Faristha"],
        "official_url": "https://www.azizidevelopments.com/dubai/al-furjan/farishta",
    },
    # --- remaining locations ---
    "azizi-grand": {
        "title": "Azizi Grand",
        "aliases": ["azizi grand", "grand"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-sports-city/azizi-grand",
    },
    "beach-oasis": {
        "title": "Beach Oasis",
        "aliases": ["beach oasis"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/studio-city/beach-oasis",
    },
    "burj-azizi": {
        "title": "Burj Azizi",
        "aliases": ["burj azizi"],
        "dirs": [],
        "official_url": "https://www.burjazizi.com/",
    },
    # --- official_url under /dubai/dubai-south/ ---
    "monaco-mansions": {
        "title": "Monaco Mansions",
        "aliases": ["monaco mansions", "monaco"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/monaco-mansions",
    },
    "azizi-venice": {
        "title": "Azizi Venice",
        "aliases": ["azizi venice", "venice"],
        "dirs": ["Dubai South/Azizi Venice"],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-south/azizi-venice",
    },
    # --- official_url under /dubai/azizi-milan/ and /dubai/dubai-islands/ ---
    "azizi-milan": {
        "title": "Azizi Milan",
        "aliases": ["azizi milan", "milan"],
        "dirs": ["Azizi Milan"],
        "official_url": "https://www.azizidevelopments.com/dubai/azizi-milan/azizi-milan",
    },
    "azizi-wasel": {
        "title": "Azizi Wasel",
        "aliases": ["azizi wasel", "wasel"],
        "dirs": [],
        "official_url": "https://www.azizidevelopments.com/dubai/dubai-islands/azizi-wasel",
    },
}
|
|
|
|
|
|
def clean_text(value: str) -> str:
    """Collapse each run of whitespace in *value* to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", value)
    return single_spaced.strip()
|
|
|
|
|
|
def json_value(value: Any) -> Any:
    """Return *value* in a form that ``json.dumps`` can serialize.

    Args:
        value: A raw value, e.g. the content of a spreadsheet cell.

    Returns:
        ``value.isoformat()`` for objects that provide ``isoformat`` (dates,
        times, datetimes); the value itself when it is ``None`` or a
        ``str``/``int``/``float``/``bool``/``list``/``tuple``/``dict``;
        otherwise ``str(value)``.
    """
    if hasattr(value, "isoformat"):
        return value.isoformat()

    if value is None or isinstance(value, (str, int, float, bool, list, tuple, dict)):
        return value

    # Anything else (for instance a datetime.timedelta coming from a
    # duration-formatted cell) would make json.dumps raise TypeError and abort
    # the whole export, so fall back to its text form.
    return str(value)
|
|
|
|
|
|
def is_small_facts_pdf(path: Path) -> bool:
    """Tell whether *path* is a factsheet-style file small enough to parse.

    The file qualifies when it is at most 7,000,000 bytes and its lower-cased
    name contains "fact" or "payment plan". The suffix is not inspected here.
    """
    filename = path.name.lower()
    if path.stat().st_size > 7_000_000:
        return False

    for keyword in ("fact", "payment plan"):
        if keyword in filename:
            return True
    return False
|
|
|
|
|
|
def extract_pdf_text(path: Path) -> str:
    """Return a short, whitespace-normalized text sample from a PDF.

    Only the first two pages are read. Their text is joined with newlines,
    passed through ``clean_text`` and cut to 3500 characters. Any exception
    raised while opening or reading the PDF is reported in the returned string
    instead of being propagated.
    """
    try:
        reader = PdfReader(str(path))
        page_texts = []
        for page in reader.pages[:2]:
            # extract_text() may yield nothing for a page; treat that as empty.
            page_texts.append(page.extract_text() or "")
        raw = "\n".join(page_texts)
    except Exception as exc:
        return f"[PDF extraction failed: {exc}]"

    normalized = clean_text(raw)
    return normalized[:3500]
|
|
|
|
|
|
def extract_xlsx(path: Path) -> list[list[Any]]:
    """Sample the leading rows of a spreadsheet.

    Reads at most the first two worksheets and, per worksheet, at most 28 rows
    that contain at least one non-empty cell. Each sampled row is returned as
    ``[sheet title, value, ...]`` holding the first 8 non-empty cell values,
    converted with ``json_value``.

    Args:
        path: Location of the ``.xlsx`` file.

    Returns:
        The sampled rows, or the single row
        ``["XLSX extraction failed", <error text>]`` when the workbook cannot
        be opened.
    """
    try:
        workbook = load_workbook(path, data_only=True, read_only=True)
    except Exception as exc:
        return [["XLSX extraction failed", str(exc)]]

    rows: list[list[Any]] = []
    try:
        for sheet in workbook.worksheets[:2]:
            seen = 0
            for row in sheet.iter_rows():
                compact = [json_value(cell.value) for cell in row if cell.value is not None]
                if not compact:
                    continue
                rows.append([sheet.title, *compact[:8]])
                seen += 1
                if seen >= 28:
                    # Per-sheet cap reached; move on to the next worksheet.
                    break
    finally:
        # A read-only workbook keeps its source file open until it is closed
        # explicitly, so release it even if reading a row fails.
        workbook.close()

    return rows
|
|
|
|
|
|
def _match_key(text: str) -> str:
    """Lower-case *text* and bring it into Unicode NFC form for comparison."""
    return unicodedata.normalize("NFC", text.lower())


def file_matches(path: Path, aliases: list[str]) -> bool:
    """Tell whether any alias occurs as a substring of *path*.

    The comparison is case-insensitive and independent of Unicode
    composition: some filesystems hand back decomposed names ("e" followed by
    a combining accent), which would otherwise never equal a precomposed alias
    such as "rêve".

    Args:
        path: File path to test; its full string form is searched.
        aliases: Candidate substrings.

    Returns:
        True when at least one alias is found, False otherwise (including
        when *aliases* is empty).
    """
    haystack = _match_key(str(path))
    return any(_match_key(alias) in haystack for alias in aliases)
|
|
|
|
|
|
def candidate_files(project: dict[str, Any], all_files: list[Path]) -> list[Path]:
    """Collect the files that belong to *project*, without duplicates.

    Two sources are combined, in this order:

    1. every file below each of the project's ``dirs`` (relative to ``ROOT``)
       that exists;
    2. every entry of *all_files* whose path matches one of the project's
       ``aliases`` (see ``file_matches``).

    Each group is sorted so the result does not depend on the order in which
    the filesystem happens to list entries; callers truncate this list, so an
    unstable order would change which files end up in the report.

    Args:
        project: Catalogue entry providing ``"dirs"`` and ``"aliases"``.
        all_files: Files to test against the aliases.

    Returns:
        Unique paths (deduplicated by their string form), directory hits first.
    """
    candidates: list[Path] = []

    for rel_dir in project["dirs"]:
        base = ROOT / rel_dir
        if base.exists():
            candidates.extend(sorted(path for path in base.rglob("*") if path.is_file()))

    aliases = project["aliases"]
    candidates.extend(sorted(path for path in all_files if file_matches(path, aliases)))

    # A dict keeps the first position of every key, which preserves the
    # "directory hits first" ordering while dropping repeats.
    unique = {str(path): path for path in candidates}
    return list(unique.values())
|
|
|
|
|
|
def summarize_project(slug: str, project: dict[str, Any], all_files: list[Path]) -> dict[str, Any]:
    """Build the report entry for one project.

    Files found by ``candidate_files`` are sorted into three groups:

    * images: ``.jpg``/``.jpeg``/``.png``/``.webp`` up to 25,000,000 bytes
    * spreadsheets: ``.xlsx`` up to 8,000,000 bytes
    * PDFs accepted by ``is_small_facts_pdf``

    The ``coverage`` counts describe the full groups; the detailed sections
    are limited to the first 12 images, 4 spreadsheets and 3 PDFs.
    """
    image_suffixes = {".jpg", ".jpeg", ".png", ".webp"}
    files = candidate_files(project, all_files)

    images: list[str] = []
    xlsx_files: list[Path] = []
    pdf_files: list[Path] = []
    for path in files:
        suffix = path.suffix.lower()
        if suffix in image_suffixes:
            if path.stat().st_size <= 25_000_000:
                images.append(str(path))
        elif suffix == ".xlsx":
            if path.stat().st_size <= 8_000_000:
                xlsx_files.append(path)
        elif suffix == ".pdf":
            if is_small_facts_pdf(path):
                pdf_files.append(path)

    # Only report the configured directories that are actually present.
    source_dirs: list[str] = []
    for rel_dir in project["dirs"]:
        location = ROOT / rel_dir
        if location.exists():
            source_dirs.append(str(location))

    coverage = {
        "files": len(files),
        "images": len(images),
        "xlsx": len(xlsx_files),
        "small_fact_pdfs": len(pdf_files),
    }

    xlsx_extracts = []
    for path in xlsx_files[:4]:
        xlsx_extracts.append({"path": str(path), "rows": extract_xlsx(path)})

    pdf_extracts = []
    for path in pdf_files[:3]:
        pdf_extracts.append({"path": str(path), "text": extract_pdf_text(path)})

    return {
        "slug": slug,
        "title": project["title"],
        "official_url": project["official_url"],
        "source_dirs": source_dirs,
        "coverage": coverage,
        "image_candidates": images[:12],
        "xlsx_extracts": xlsx_extracts,
        "pdf_extracts": pdf_extracts,
    }
|
|
|
|
|
|
def main() -> None:
    """Scan ``ROOT``, summarize every catalogued project and write ``OUTPUT``.

    The report is written as UTF-8 JSON (non-ASCII characters kept as-is,
    two-space indentation) and the output path is printed afterwards.
    """
    # List the tree once; every project summary reuses this listing.
    all_files = [entry for entry in ROOT.rglob("*") if entry.is_file()]

    projects: dict[str, Any] = {}
    for slug, project in PROJECTS.items():
        projects[slug] = summarize_project(slug, project, all_files)

    extraction = {
        "source_root": str(ROOT),
        "note": "Automated extraction from small factsheets, project-information spreadsheets and image filenames. Large brochures are intentionally skipped.",
        "projects": projects,
    }

    payload = json.dumps(extraction, ensure_ascii=False, indent=2)
    OUTPUT.write_text(payload, encoding="utf-8")
    print(f"Wrote {OUTPUT}")
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|