system-prompts-and-models-o.../personal-brand-engine/agents/opportunity_scout/scanners.py
VoXc2 4bb2442313
Add Personal Brand Engine - 7 AI Agents Automation System
Complete AI-powered personal brand automation for Sami Assiri.\n\n7 agents: LinkedIn, Email, Social Media, WhatsApp, CV Optimizer, Content Strategist, Opportunity Scout.\nInfra: FastAPI + APScheduler + Docker + Ollama/Groq LLM + GitHub Pages landing page.\n83 files, ~10K lines. Cost: $0-5/month.
2026-03-30 11:45:48 +03:00

364 lines
13 KiB
Python

"""Scanners -- free-API and RSS-based data sources for opportunity discovery.
Each scanner is an async function that returns ``list[dict]`` where every
dict has keys: title, company, url, description, source.
"""
from __future__ import annotations
import xml.etree.ElementTree as ET
from typing import Any
from urllib.parse import quote_plus
import httpx
from utils.logger import get_logger
logger = get_logger(__name__)
_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
),
"Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
}
# Default keyword sets tailored to Sami's profile
DEFAULT_JOB_KEYWORDS: list[str] = [
"field services engineer airport security",
"Smiths Detection engineer",
"METCO field engineer Saudi",
"airport security equipment engineer",
"aviation security engineer Riyadh",
"Rapiscan field engineer",
"L3Harris security engineer Saudi",
"mechanical engineer airport Saudi Arabia",
]
DEFAULT_NEWS_KEYWORDS: list[str] = [
"Smiths Detection",
"GACA Saudi Arabia aviation",
"airport security technology Saudi",
"Riyadh airport expansion",
"Saudi Arabia aviation security",
"Nuctech airport",
"baggage screening technology",
]
# ---------------------------------------------------------------------------
# Google Jobs (via Google custom search-style scraping)
# ---------------------------------------------------------------------------
async def scan_google_jobs(
keywords: list[str] | None = None,
location: str = "Saudi Arabia",
) -> list[dict]:
"""Search for jobs via Google's public search results (RSS/HTML).
Uses the Google News RSS feed with job-related queries. This does NOT
require an API key.
"""
keywords = keywords or DEFAULT_JOB_KEYWORDS
results: list[dict] = []
async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client:
for kw in keywords:
query = quote_plus(f"{kw} {location} jobs")
url = f"https://news.google.com/rss/search?q={query}&hl=en-SA&gl=SA&ceid=SA:en"
try:
resp = await client.get(url)
resp.raise_for_status()
entries = _parse_rss(resp.text, source="google_jobs")
results.extend(entries)
except (httpx.HTTPStatusError, httpx.RequestError) as exc:
logger.warning("google_jobs_error", keyword=kw, error=str(exc))
except ET.ParseError as exc:
logger.warning("google_jobs_xml_error", keyword=kw, error=str(exc))
# Deduplicate by URL
seen: set[str] = set()
unique: list[dict] = []
for r in results:
key = r.get("url", r.get("title", ""))
if key not in seen:
seen.add(key)
unique.append(r)
logger.info("google_jobs_scan_complete", count=len(unique))
return unique
# ---------------------------------------------------------------------------
# LinkedIn (via linkedin-api library)
# ---------------------------------------------------------------------------
async def scan_linkedin_jobs_api(
linkedin_api: Any | None = None,
keywords: list[str] | None = None,
) -> list[dict]:
"""Search LinkedIn for relevant jobs using the ``linkedin-api`` library.
Parameters
----------
linkedin_api:
An authenticated ``linkedin_api.Linkedin`` instance. If ``None``,
returns an empty list (credentials not configured).
keywords:
Search terms. Defaults to Sami-relevant keywords.
"""
if linkedin_api is None:
logger.info("linkedin_api_not_configured")
return []
keywords = keywords or [
"field services engineer",
"airport security engineer",
"Smiths Detection",
"METCO",
"aviation security",
]
results: list[dict] = []
for kw in keywords:
try:
jobs = linkedin_api.search_jobs(
keywords=kw,
location_name="Saudi Arabia",
limit=10,
)
for job in jobs:
title = job.get("title", "")
company = job.get("companyName", "") or job.get("company", "")
job_id = job.get("dashEntityUrn", "") or job.get("entityUrn", "")
url = f"https://www.linkedin.com/jobs/view/{job_id.split(':')[-1]}" if job_id else ""
results.append({
"title": title,
"company": company,
"url": url,
"description": job.get("description", "")[:1000],
"source": "linkedin",
})
except Exception as exc: # noqa: BLE001
logger.warning("linkedin_search_error", keyword=kw, error=str(exc))
logger.info("linkedin_scan_complete", count=len(results))
return results
# ---------------------------------------------------------------------------
# News -- RSS feeds for industry news
# ---------------------------------------------------------------------------
_NEWS_RSS_FEEDS: list[str] = [
# Google News RSS for specific topics
"https://news.google.com/rss/search?q=Smiths+Detection&hl=en&gl=US&ceid=US:en",
"https://news.google.com/rss/search?q=airport+security+technology&hl=en&gl=SA&ceid=SA:en",
"https://news.google.com/rss/search?q=Saudi+Arabia+aviation+security&hl=en&gl=SA&ceid=SA:en",
"https://news.google.com/rss/search?q=GACA+Saudi+Arabia&hl=en&gl=SA&ceid=SA:en",
"https://news.google.com/rss/search?q=Riyadh+airport+expansion&hl=en&gl=SA&ceid=SA:en",
# Aviation security industry feeds
"https://news.google.com/rss/search?q=baggage+screening+technology&hl=en&gl=US&ceid=US:en",
]
async def scan_news(
keywords: list[str] | None = None,
) -> list[dict]:
"""Fetch industry news from RSS feeds and optional keyword searches.
Parameters
----------
keywords:
Additional keywords to search via Google News RSS. The built-in
feed list always runs regardless.
"""
keywords = keywords or DEFAULT_NEWS_KEYWORDS
results: list[dict] = []
# Build the full list of RSS URLs
urls = list(_NEWS_RSS_FEEDS)
for kw in keywords:
q = quote_plus(kw)
urls.append(
f"https://news.google.com/rss/search?q={q}&hl=en&gl=SA&ceid=SA:en"
)
async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client:
for url in urls:
try:
resp = await client.get(url)
resp.raise_for_status()
entries = _parse_rss(resp.text, source="news")
results.extend(entries)
except (httpx.HTTPStatusError, httpx.RequestError) as exc:
logger.warning("news_rss_error", url=url[:80], error=str(exc))
except ET.ParseError as exc:
logger.warning("news_xml_error", url=url[:80], error=str(exc))
# Deduplicate
seen: set[str] = set()
unique: list[dict] = []
for r in results:
key = r.get("url", r.get("title", ""))
if key not in seen:
seen.add(key)
unique.append(r)
logger.info("news_scan_complete", count=len(unique))
return unique
# ---------------------------------------------------------------------------
# Smiths Detection careers page
# ---------------------------------------------------------------------------
_SMITHS_CAREERS_URL = "https://www.smithsdetection.com/careers"
_SMITHS_JOBS_RSS = (
"https://news.google.com/rss/search?"
"q=%22Smiths+Detection%22+careers+OR+jobs+OR+hiring&hl=en&gl=US&ceid=US:en"
)
async def scan_smiths_detection_careers() -> list[dict]:
"""Check Smiths Detection for new job postings.
Since the Smiths Detection careers page may not expose a public API,
this scanner searches via Google News RSS for Smiths Detection hiring
announcements, and also attempts to fetch the careers page for links.
"""
results: list[dict] = []
async with httpx.AsyncClient(
timeout=20.0, headers=_HEADERS, follow_redirects=True
) as client:
# Approach 1: Google News RSS for Smiths Detection job postings
try:
resp = await client.get(_SMITHS_JOBS_RSS)
resp.raise_for_status()
entries = _parse_rss(resp.text, source="smiths_detection_careers")
for entry in entries:
entry["company"] = "Smiths Detection"
results.extend(entries)
except (httpx.HTTPStatusError, httpx.RequestError) as exc:
logger.warning("smiths_rss_error", error=str(exc))
except ET.ParseError as exc:
logger.warning("smiths_rss_xml_error", error=str(exc))
# Approach 2: Try scraping the careers page for job listing links
try:
resp = await client.get(_SMITHS_CAREERS_URL)
resp.raise_for_status()
# Basic extraction of job-related links from HTML
_extract_career_links(resp.text, results)
except (httpx.HTTPStatusError, httpx.RequestError) as exc:
logger.warning("smiths_careers_page_error", error=str(exc))
logger.info("smiths_detection_scan_complete", count=len(results))
return results
def _extract_career_links(html: str, results: list[dict]) -> None:
"""Naively extract job links from the Smiths Detection careers HTML."""
import re
# Look for links that look like job postings
pattern = re.compile(
r'<a[^>]+href="([^"]*(?:job|career|position|opening)[^"]*)"[^>]*>'
r"(.*?)</a>",
re.IGNORECASE | re.DOTALL,
)
for match in pattern.finditer(html):
url = match.group(1)
title_raw = match.group(2)
# Strip HTML tags from the title
title = re.sub(r"<[^>]+>", "", title_raw).strip()
if title and len(title) > 5:
results.append({
"title": title,
"company": "Smiths Detection",
"url": url if url.startswith("http") else f"https://www.smithsdetection.com{url}",
"description": "",
"source": "smiths_detection_careers",
})
# ---------------------------------------------------------------------------
# GACA (General Authority of Civil Aviation) announcements
# ---------------------------------------------------------------------------
_GACA_URLS = [
# Google News RSS for GACA-related announcements
"https://news.google.com/rss/search?q=GACA+Saudi+Arabia+aviation&hl=en&gl=SA&ceid=SA:en",
"https://news.google.com/rss/search?q=%22General+Authority+of+Civil+Aviation%22+Saudi&hl=en&gl=SA&ceid=SA:en",
# Arabic search
"https://news.google.com/rss/search?q=%D8%A7%D9%84%D8%B7%D9%8A%D8%B1%D8%A7%D9%86+%D8%A7%D9%84%D9%85%D8%AF%D9%86%D9%8A+%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A&hl=ar&gl=SA&ceid=SA:ar",
]
async def scan_gaca_announcements() -> list[dict]:
"""Monitor GACA (Saudi General Authority of Civil Aviation) news.
Uses Google News RSS to find announcements related to GACA, Saudi
aviation regulation, and airport security mandates.
"""
results: list[dict] = []
async with httpx.AsyncClient(timeout=20.0, headers=_HEADERS) as client:
for url in _GACA_URLS:
try:
resp = await client.get(url)
resp.raise_for_status()
entries = _parse_rss(resp.text, source="gaca")
for entry in entries:
if not entry.get("company"):
entry["company"] = "GACA / Saudi Aviation"
results.extend(entries)
except (httpx.HTTPStatusError, httpx.RequestError) as exc:
logger.warning("gaca_rss_error", url=url[:80], error=str(exc))
except ET.ParseError as exc:
logger.warning("gaca_xml_error", url=url[:80], error=str(exc))
# Deduplicate
seen: set[str] = set()
unique: list[dict] = []
for r in results:
key = r.get("url", r.get("title", ""))
if key not in seen:
seen.add(key)
unique.append(r)
logger.info("gaca_scan_complete", count=len(unique))
return unique
# ---------------------------------------------------------------------------
# RSS parsing helper
# ---------------------------------------------------------------------------
def _parse_rss(xml_text: str, source: str) -> list[dict]:
"""Parse an RSS 2.0 feed and return a list of opportunity dicts."""
results: list[dict] = []
root = ET.fromstring(xml_text) # noqa: S314
# RSS 2.0: /rss/channel/item
for item in root.findall(".//item"):
title = (item.findtext("title") or "").strip()
link = (item.findtext("link") or "").strip()
description = (item.findtext("description") or "").strip()
# Google News often puts the source in <source> tag
src_tag = item.find("source")
company = src_tag.text.strip() if src_tag is not None and src_tag.text else ""
if title:
results.append({
"title": title,
"company": company,
"url": link,
"description": description[:1000],
"source": source,
})
return results