"""Scanners -- free-API and RSS-based data sources for opportunity discovery. Each scanner is an async function that returns ``list[dict]`` where every dict has keys: title, company, url, description, source. """ from __future__ import annotations import xml.etree.ElementTree as ET from typing import Any from urllib.parse import quote_plus import httpx from utils.logger import get_logger logger = get_logger(__name__) _HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9,ar;q=0.8", } # Default keyword sets tailored to Sami's profile DEFAULT_JOB_KEYWORDS: list[str] = [ "field services engineer airport security", "Smiths Detection engineer", "METCO field engineer Saudi", "airport security equipment engineer", "aviation security engineer Riyadh", "Rapiscan field engineer", "L3Harris security engineer Saudi", "mechanical engineer airport Saudi Arabia", ] DEFAULT_NEWS_KEYWORDS: list[str] = [ "Smiths Detection", "GACA Saudi Arabia aviation", "airport security technology Saudi", "Riyadh airport expansion", "Saudi Arabia aviation security", "Nuctech airport", "baggage screening technology", ] # --------------------------------------------------------------------------- # Google Jobs (via Google custom search-style scraping) # --------------------------------------------------------------------------- async def scan_google_jobs( keywords: list[str] | None = None, location: str = "Saudi Arabia", ) -> list[dict]: """Search for jobs via Google's public search results (RSS/HTML). Uses the Google News RSS feed with job-related queries. This does NOT require an API key. """ keywords = keywords or DEFAULT_JOB_KEYWORDS results: list[dict] = [] async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client: for kw in keywords: query = quote_plus(f"{kw} {location} jobs") url = f"https://news.google.com/rss/search?q={query}&hl=en-SA&gl=SA&ceid=SA:en" try: resp = await client.get(url) resp.raise_for_status() entries = _parse_rss(resp.text, source="google_jobs") results.extend(entries) except (httpx.HTTPStatusError, httpx.RequestError) as exc: logger.warning("google_jobs_error", keyword=kw, error=str(exc)) except ET.ParseError as exc: logger.warning("google_jobs_xml_error", keyword=kw, error=str(exc)) # Deduplicate by URL seen: set[str] = set() unique: list[dict] = [] for r in results: key = r.get("url", r.get("title", "")) if key not in seen: seen.add(key) unique.append(r) logger.info("google_jobs_scan_complete", count=len(unique)) return unique # --------------------------------------------------------------------------- # LinkedIn (via linkedin-api library) # --------------------------------------------------------------------------- async def scan_linkedin_jobs_api( linkedin_api: Any | None = None, keywords: list[str] | None = None, ) -> list[dict]: """Search LinkedIn for relevant jobs using the ``linkedin-api`` library. Parameters ---------- linkedin_api: An authenticated ``linkedin_api.Linkedin`` instance. If ``None``, returns an empty list (credentials not configured). keywords: Search terms. Defaults to Sami-relevant keywords. """ if linkedin_api is None: logger.info("linkedin_api_not_configured") return [] keywords = keywords or [ "field services engineer", "airport security engineer", "Smiths Detection", "METCO", "aviation security", ] results: list[dict] = [] for kw in keywords: try: jobs = linkedin_api.search_jobs( keywords=kw, location_name="Saudi Arabia", limit=10, ) for job in jobs: title = job.get("title", "") company = job.get("companyName", "") or job.get("company", "") job_id = job.get("dashEntityUrn", "") or job.get("entityUrn", "") url = f"https://www.linkedin.com/jobs/view/{job_id.split(':')[-1]}" if job_id else "" results.append({ "title": title, "company": company, "url": url, "description": job.get("description", "")[:1000], "source": "linkedin", }) except Exception as exc: # noqa: BLE001 logger.warning("linkedin_search_error", keyword=kw, error=str(exc)) logger.info("linkedin_scan_complete", count=len(results)) return results # --------------------------------------------------------------------------- # News -- RSS feeds for industry news # --------------------------------------------------------------------------- _NEWS_RSS_FEEDS: list[str] = [ # Google News RSS for specific topics "https://news.google.com/rss/search?q=Smiths+Detection&hl=en&gl=US&ceid=US:en", "https://news.google.com/rss/search?q=airport+security+technology&hl=en&gl=SA&ceid=SA:en", "https://news.google.com/rss/search?q=Saudi+Arabia+aviation+security&hl=en&gl=SA&ceid=SA:en", "https://news.google.com/rss/search?q=GACA+Saudi+Arabia&hl=en&gl=SA&ceid=SA:en", "https://news.google.com/rss/search?q=Riyadh+airport+expansion&hl=en&gl=SA&ceid=SA:en", # Aviation security industry feeds "https://news.google.com/rss/search?q=baggage+screening+technology&hl=en&gl=US&ceid=US:en", ] async def scan_news( keywords: list[str] | None = None, ) -> list[dict]: """Fetch industry news from RSS feeds and optional keyword searches. Parameters ---------- keywords: Additional keywords to search via Google News RSS. The built-in feed list always runs regardless. """ keywords = keywords or DEFAULT_NEWS_KEYWORDS results: list[dict] = [] # Build the full list of RSS URLs urls = list(_NEWS_RSS_FEEDS) for kw in keywords: q = quote_plus(kw) urls.append( f"https://news.google.com/rss/search?q={q}&hl=en&gl=SA&ceid=SA:en" ) async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client: for url in urls: try: resp = await client.get(url) resp.raise_for_status() entries = _parse_rss(resp.text, source="news") results.extend(entries) except (httpx.HTTPStatusError, httpx.RequestError) as exc: logger.warning("news_rss_error", url=url[:80], error=str(exc)) except ET.ParseError as exc: logger.warning("news_xml_error", url=url[:80], error=str(exc)) # Deduplicate seen: set[str] = set() unique: list[dict] = [] for r in results: key = r.get("url", r.get("title", "")) if key not in seen: seen.add(key) unique.append(r) logger.info("news_scan_complete", count=len(unique)) return unique # --------------------------------------------------------------------------- # Smiths Detection careers page # --------------------------------------------------------------------------- _SMITHS_CAREERS_URL = "https://www.smithsdetection.com/careers" _SMITHS_JOBS_RSS = ( "https://news.google.com/rss/search?" "q=%22Smiths+Detection%22+careers+OR+jobs+OR+hiring&hl=en&gl=US&ceid=US:en" ) async def scan_smiths_detection_careers() -> list[dict]: """Check Smiths Detection for new job postings. Since the Smiths Detection careers page may not expose a public API, this scanner searches via Google News RSS for Smiths Detection hiring announcements, and also attempts to fetch the careers page for links. """ results: list[dict] = [] async with httpx.AsyncClient( timeout=20.0, headers=_HEADERS, follow_redirects=True ) as client: # Approach 1: Google News RSS for Smiths Detection job postings try: resp = await client.get(_SMITHS_JOBS_RSS) resp.raise_for_status() entries = _parse_rss(resp.text, source="smiths_detection_careers") for entry in entries: entry["company"] = "Smiths Detection" results.extend(entries) except (httpx.HTTPStatusError, httpx.RequestError) as exc: logger.warning("smiths_rss_error", error=str(exc)) except ET.ParseError as exc: logger.warning("smiths_rss_xml_error", error=str(exc)) # Approach 2: Try scraping the careers page for job listing links try: resp = await client.get(_SMITHS_CAREERS_URL) resp.raise_for_status() # Basic extraction of job-related links from HTML _extract_career_links(resp.text, results) except (httpx.HTTPStatusError, httpx.RequestError) as exc: logger.warning("smiths_careers_page_error", error=str(exc)) logger.info("smiths_detection_scan_complete", count=len(results)) return results def _extract_career_links(html: str, results: list[dict]) -> None: """Naively extract job links from the Smiths Detection careers HTML.""" import re # Look for links that look like job postings pattern = re.compile( r']+href="([^"]*(?:job|career|position|opening)[^"]*)"[^>]*>' r"(.*?)", re.IGNORECASE | re.DOTALL, ) for match in pattern.finditer(html): url = match.group(1) title_raw = match.group(2) # Strip HTML tags from the title title = re.sub(r"<[^>]+>", "", title_raw).strip() if title and len(title) > 5: results.append({ "title": title, "company": "Smiths Detection", "url": url if url.startswith("http") else f"https://www.smithsdetection.com{url}", "description": "", "source": "smiths_detection_careers", }) # --------------------------------------------------------------------------- # GACA (General Authority of Civil Aviation) announcements # --------------------------------------------------------------------------- _GACA_URLS = [ # Google News RSS for GACA-related announcements "https://news.google.com/rss/search?q=GACA+Saudi+Arabia+aviation&hl=en&gl=SA&ceid=SA:en", "https://news.google.com/rss/search?q=%22General+Authority+of+Civil+Aviation%22+Saudi&hl=en&gl=SA&ceid=SA:en", # Arabic search "https://news.google.com/rss/search?q=%D8%A7%D9%84%D8%B7%D9%8A%D8%B1%D8%A7%D9%86+%D8%A7%D9%84%D9%85%D8%AF%D9%86%D9%8A+%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A&hl=ar&gl=SA&ceid=SA:ar", ] async def scan_gaca_announcements() -> list[dict]: """Monitor GACA (Saudi General Authority of Civil Aviation) news. Uses Google News RSS to find announcements related to GACA, Saudi aviation regulation, and airport security mandates. """ results: list[dict] = [] async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client: for url in _GACA_URLS: try: resp = await client.get(url) resp.raise_for_status() entries = _parse_rss(resp.text, source="gaca") for entry in entries: if not entry.get("company"): entry["company"] = "GACA / Saudi Aviation" results.extend(entries) except (httpx.HTTPStatusError, httpx.RequestError) as exc: logger.warning("gaca_rss_error", url=url[:80], error=str(exc)) except ET.ParseError as exc: logger.warning("gaca_xml_error", url=url[:80], error=str(exc)) # Deduplicate seen: set[str] = set() unique: list[dict] = [] for r in results: key = r.get("url", r.get("title", "")) if key not in seen: seen.add(key) unique.append(r) logger.info("gaca_scan_complete", count=len(unique)) return unique # --------------------------------------------------------------------------- # RSS parsing helper # --------------------------------------------------------------------------- def _parse_rss(xml_text: str, source: str) -> list[dict]: """Parse an RSS 2.0 feed and return a list of opportunity dicts.""" results: list[dict] = [] root = ET.fromstring(xml_text) # noqa: S314 # RSS 2.0: /rss/channel/item for item in root.findall(".//item"): title = (item.findtext("title") or "").strip() link = (item.findtext("link") or "").strip() description = (item.findtext("description") or "").strip() # Google News often puts the source in tag src_tag = item.find("source") company = src_tag.text.strip() if src_tag is not None and src_tag.text else "" if title: results.append({ "title": title, "company": company, "url": link, "description": description[:1000], "source": source, }) return results