dev immobile
This commit is contained in:
parent
18ca2ce858
commit
e198d842ce
18 changed files with 5931 additions and 3 deletions
250
dev/immobilien 07-05-2026/extract_azizi_sources.py
Normal file
250
dev/immobilien 07-05-2026/extract_azizi_sources.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from openpyxl import load_workbook
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
# Directory tree that is scanned for project material (relative to the
# current working directory).
ROOT = Path("dev/immobilien 07-05-2026/AZIZI SharePoint")

# File that main() writes the JSON summary to.
OUTPUT = Path("dev/immobilien 07-05-2026/azizi-extraction.json")

# Project registry keyed by slug. Every entry has the same four keys:
#   title        - display name copied into the summary
#   aliases      - names matched case-insensitively against file paths
#   dirs         - directories, relative to ROOT, collected wholesale
#   official_url - project page URL copied into the summary
PROJECTS: dict[str, dict[str, Any]] = {
    "mina": dict(
        title="Mina",
        aliases=["mina"],
        dirs=["Palm Jumeirah/Mina", "Beno Photography/Mina"],
        official_url="https://www.azizidevelopments.com/dubai/palm-jumeirah/mina",
    ),
    "royal-bay": dict(
        title="Royal Bay",
        aliases=["royal bay"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/palm-jumeirah/royal-bay",
    ),
    "creek-views": dict(
        title="Creek Views",
        aliases=["creek views"],
        dirs=["AI Jaddaf and DHCC/Creek Views"],
        official_url="https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views",
    ),
    "creek-views-ii": dict(
        title="Creek Views II",
        aliases=["creek views ii", "creek views 2"],
        dirs=["AI Jaddaf and DHCC/Creek Views II"],
        official_url="https://www.azizidevelopments.com/dubai/dubai-healthcare-city/creek-views-ii",
    ),
    "azizi-riviera": dict(
        title="Azizi Riviera",
        aliases=["riviera"],
        dirs=[
            "MBR City/Riviera",
            "Beno Photography/Riviera Boulevard",
            "Beno Photography/Riviera Lagoon",
        ],
        official_url="https://www.azizidevelopments.com/dubai/meydan/riviera",
    ),
    "riviera-reve": dict(
        title="Riviera Reve",
        aliases=["reve", "rêve"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/meydan/riviera-reve",
    ),
    "riviera-beachfront": dict(
        title="Riviera Beachfront",
        aliases=["riviera beachfront"],
        dirs=["Off-Plan Projects Azizi/MBR City/Riviera Beachfront"],
        official_url="https://www.azizidevelopments.com/dubai/meydan/riviera-beachfront",
    ),
    "park-avenue": dict(
        title="Park Avenue",
        aliases=["park avenue"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/meydan/park-avenue",
    ),
    "star": dict(
        title="Star",
        aliases=["star"],
        dirs=["Al Furjan/Star"],
        official_url="https://www.azizidevelopments.com/dubai/al-furjan/star",
    ),
    "roy-mediterranean": dict(
        title="ROY Mediterranean",
        aliases=["roy mediterranean", "roy"],
        dirs=["Al Furjan/Roy"],
        official_url="https://www.azizidevelopments.com/dubai/al-furjan/roy-mediterranean",
    ),
    "farishta": dict(
        title="Farishta",
        aliases=["farishta", "faristha"],
        dirs=["Al Furjan/Faristha"],
        official_url="https://www.azizidevelopments.com/dubai/al-furjan/farishta",
    ),
    "azizi-grand": dict(
        title="Azizi Grand",
        aliases=["azizi grand", "grand"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/dubai-sports-city/azizi-grand",
    ),
    "beach-oasis": dict(
        title="Beach Oasis",
        aliases=["beach oasis"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/studio-city/beach-oasis",
    ),
    "burj-azizi": dict(
        title="Burj Azizi",
        aliases=["burj azizi"],
        dirs=[],
        official_url="https://www.burjazizi.com/",
    ),
    "monaco-mansions": dict(
        title="Monaco Mansions",
        aliases=["monaco mansions", "monaco"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/dubai-south/monaco-mansions",
    ),
    "azizi-venice": dict(
        title="Azizi Venice",
        aliases=["azizi venice", "venice"],
        dirs=["Dubai South/Azizi Venice"],
        official_url="https://www.azizidevelopments.com/dubai/dubai-south/azizi-venice",
    ),
    "azizi-milan": dict(
        title="Azizi Milan",
        aliases=["azizi milan", "milan"],
        dirs=["Azizi Milan"],
        official_url="https://www.azizidevelopments.com/dubai/azizi-milan/azizi-milan",
    ),
    "azizi-wasel": dict(
        title="Azizi Wasel",
        aliases=["azizi wasel", "wasel"],
        dirs=[],
        official_url="https://www.azizidevelopments.com/dubai/dubai-islands/azizi-wasel",
    ),
}
|
||||
|
||||
|
||||
def clean_text(value: str) -> str:
    """Collapse each run of whitespace in *value* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", value)
    return collapsed.strip()
|
||||
|
||||
|
||||
def json_value(value: Any) -> Any:
    """Return a representation of *value* that ``json.dumps`` can serialise.

    * Objects exposing ``isoformat`` (dates, times, datetimes) become their
      ISO-8601 string.
    * ``None``, strings, numbers, booleans, lists, tuples and dicts are
      returned unchanged.
    * Anything else (for example ``datetime.timedelta`` or
      ``decimal.Decimal``) is converted with ``str()`` so that a single
      unusual value cannot make the final ``json.dumps`` call raise
      ``TypeError``.
    """
    if hasattr(value, "isoformat"):
        return value.isoformat()

    if value is None or isinstance(value, (str, int, float, bool, list, tuple, dict)):
        return value

    # Not natively serialisable: fall back to the textual form.
    return str(value)
|
||||
|
||||
|
||||
def is_small_facts_pdf(path: Path) -> bool:
    """Tell whether *path* is a small fact-sheet or payment-plan file.

    A file qualifies when it is at most 7,000,000 bytes and its lower-cased
    file name contains "fact" or "payment plan". The file suffix is not
    inspected here.
    """
    name = path.name.lower()
    if path.stat().st_size > 7_000_000:
        return False
    for token in ["fact", "payment plan"]:
        if token in name:
            return True
    return False
|
||||
|
||||
|
||||
def extract_pdf_text(path: Path) -> str:
    """Return a short, whitespace-normalised text sample of the PDF at *path*.

    Only the first two pages are read. The page texts are joined with
    newlines, passed through ``clean_text`` and cut to 3500 characters.
    If reading or text extraction raises, a bracketed failure message
    containing the exception text is returned instead of raising.
    """
    try:
        leading_pages = PdfReader(str(path)).pages[:2]
        text = "\n".join((page.extract_text() or "") for page in leading_pages)
    except Exception as exc:
        return f"[PDF extraction failed: {exc}]"

    normalised = clean_text(text)
    return normalised[:3500]
|
||||
|
||||
|
||||
def extract_xlsx(path: Path) -> list[list[Any]]:
    """Return a compact sample of the rows in the workbook at *path*.

    Only the first two worksheets are read. From each sheet, rows that
    contain at least one non-empty cell are collected until 28 have been
    gathered. Every returned row is ``[sheet title, value, ...]`` holding at
    most the first eight non-empty cell values, each passed through
    ``json_value``.

    If the workbook cannot be opened, a single row
    ``["XLSX extraction failed", <error text>]`` is returned instead of
    raising. Errors raised while iterating rows still propagate.
    """
    try:
        workbook = load_workbook(path, data_only=True, read_only=True)
    except Exception as exc:
        return [["XLSX extraction failed", str(exc)]]

    rows: list[list[Any]] = []
    try:
        for sheet in workbook.worksheets[:2]:
            seen = 0
            for row in sheet.iter_rows():
                values = [cell.value for cell in row]
                compact = [json_value(value) for value in values if value is not None]
                if not compact:
                    continue
                rows.append([sheet.title, *compact[:8]])
                # NOTE(review): the original indentation was lost in the
                # source view; this assumes the cap counts non-empty rows
                # rather than all scanned rows - confirm.
                seen += 1
                if seen >= 28:
                    break
    finally:
        # Read-only workbooks keep the underlying file open until they are
        # closed explicitly.
        workbook.close()

    return rows
|
||||
|
||||
|
||||
def file_matches(path: Path, aliases: list[str]) -> bool:
    """Tell whether any of *aliases* occurs as a whole word in *path*.

    The full path string is compared case-insensitively. An alias counts as
    found only when it is not directly preceded or followed by another
    letter, so "roy" matches ".../Roy/plan.pdf" and "roy_plan.pdf" but not
    ".../Royal Bay/...". Digits, underscores, spaces, punctuation and path
    separators all act as word boundaries.

    Both sides are normalised to Unicode NFC first, so that file names
    stored in decomposed form still match accented aliases such as "rêve".
    Empty aliases are ignored instead of matching every path.
    """
    # Imported locally so this function does not depend on a module-level
    # import that the file does not have.
    import unicodedata

    haystack = unicodedata.normalize("NFC", str(path)).lower()
    for alias in aliases:
        needle = unicodedata.normalize("NFC", alias).lower()
        if not needle:
            continue
        # [^\W\d_] is "any Unicode letter"; the lookarounds reject matches
        # that are glued to a neighbouring letter.
        pattern = rf"(?<![^\W\d_]){re.escape(needle)}(?![^\W\d_])"
        if re.search(pattern, haystack):
            return True
    return False
|
||||
|
||||
|
||||
def candidate_files(project: dict[str, Any], all_files: list[Path]) -> list[Path]:
    """Collect the files that belong to *project*, without duplicates.

    Files come from two places: every regular file below each existing
    directory in ``project["dirs"]`` (resolved against ``ROOT``), followed by
    every entry of *all_files* whose path matches one of
    ``project["aliases"]``. Duplicates are detected by the string form of the
    path, and the first occurrence determines the position in the result.
    """
    found: list[Path] = []

    for rel_dir in project["dirs"]:
        base = ROOT / rel_dir
        if not base.exists():
            continue
        for entry in base.rglob("*"):
            if entry.is_file():
                found.append(entry)

    for entry in all_files:
        if file_matches(entry, project["aliases"]):
            found.append(entry)

    # A dict keyed by the path string drops repeats while keeping the order
    # in which each path was first seen.
    by_name = {str(entry): entry for entry in found}
    return list(by_name.values())
|
||||
|
||||
|
||||
def summarize_project(slug: str, project: dict[str, Any], all_files: list[Path]) -> dict[str, Any]:
    """Build the JSON-ready summary for one project.

    The project's candidate files are split by suffix into images (at most
    25,000,000 bytes), ``.xlsx`` workbooks (at most 8,000,000 bytes) and PDFs
    accepted by ``is_small_facts_pdf``. The summary reports the counts of
    each group, the first 12 image paths, row extracts of the first 4
    workbooks and text extracts of the first 3 PDFs, together with the
    project's title, URL and those configured directories that exist.
    """
    image_suffixes = {".jpg", ".jpeg", ".png", ".webp"}
    files = candidate_files(project, all_files)

    images: list[str] = []
    xlsx_files: list[Path] = []
    pdf_files: list[Path] = []
    for path in files:
        suffix = path.suffix.lower()
        if suffix in image_suffixes:
            if path.stat().st_size <= 25_000_000:
                images.append(str(path))
        elif suffix == ".xlsx":
            if path.stat().st_size <= 8_000_000:
                xlsx_files.append(path)
        elif suffix == ".pdf":
            if is_small_facts_pdf(path):
                pdf_files.append(path)

    title = project["title"]
    official_url = project["official_url"]

    # Only report configured directories that are actually present.
    source_dirs: list[str] = []
    for rel_dir in project["dirs"]:
        full_dir = ROOT / rel_dir
        if full_dir.exists():
            source_dirs.append(str(full_dir))

    coverage = {
        "files": len(files),
        "images": len(images),
        "xlsx": len(xlsx_files),
        "small_fact_pdfs": len(pdf_files),
    }

    xlsx_extracts = []
    for path in xlsx_files[:4]:
        xlsx_extracts.append({"path": str(path), "rows": extract_xlsx(path)})

    pdf_extracts = []
    for path in pdf_files[:3]:
        pdf_extracts.append({"path": str(path), "text": extract_pdf_text(path)})

    return {
        "slug": slug,
        "title": title,
        "official_url": official_url,
        "source_dirs": source_dirs,
        "coverage": coverage,
        "image_candidates": images[:12],
        "xlsx_extracts": xlsx_extracts,
        "pdf_extracts": pdf_extracts,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Scan ``ROOT``, summarise every entry of ``PROJECTS`` and write the JSON report.

    The report is written to ``OUTPUT`` as indented UTF-8 JSON with
    non-ASCII characters kept as-is, and the output path is printed
    afterwards.
    """
    all_files = [entry for entry in ROOT.rglob("*") if entry.is_file()]

    projects = {}
    for slug, project in PROJECTS.items():
        projects[slug] = summarize_project(slug, project, all_files)

    extraction = {
        "source_root": str(ROOT),
        "note": "Automated extraction from small factsheets, project-information spreadsheets and image filenames. Large brochures are intentionally skipped.",
        "projects": projects,
    }

    payload = json.dumps(extraction, ensure_ascii=False, indent=2)
    OUTPUT.write_text(payload, encoding="utf-8")
    print(f"Wrote {OUTPUT}")


# Run the extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue