mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 15:29:36 +00:00
Complete AI-powered personal brand automation for Sami Assiri.

7 agents: LinkedIn, Email, Social Media, WhatsApp, CV Optimizer, Content Strategist, Opportunity Scout.
Infra: FastAPI + APScheduler + Docker + Ollama/Groq LLM + GitHub Pages landing page.
83 files, ~10K lines. Cost: $0-5/month.
364 lines
13 KiB
Python
364 lines
13 KiB
Python
"""Scanners -- free-API and RSS-based data sources for opportunity discovery.
|
|
|
|
Each scanner is an async function that returns ``list[dict]`` where every
|
|
dict has keys: title, company, url, description, source.
|
|
"""
|
|
|
|
from __future__ import annotations

import asyncio
import functools
import re
import xml.etree.ElementTree as ET
from html import unescape
from typing import Any
from urllib.parse import quote_plus, urljoin

import httpx

from utils.logger import get_logger
|
|
|
|
# Module-level logger obtained from the project's ``utils.logger`` helper and
# named after this module. The scanners call it with an event name plus
# keyword context (e.g. ``keyword=...``, ``error=...``).
logger = get_logger(__name__)
|
|
|
|
# Request headers shared by every scanner's HTTP client: a desktop-Chrome
# User-Agent string and an English-first, Arabic-second language preference.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
}
|
|
|
|
# Default keyword sets tailored to Sami's profile.
# Job-search phrases used by ``scan_google_jobs`` when the caller passes none.
DEFAULT_JOB_KEYWORDS: list[str] = [
    "field services engineer airport security",
    "Smiths Detection engineer",
    "METCO field engineer Saudi",
    "airport security equipment engineer",
    "aviation security engineer Riyadh",
    "Rapiscan field engineer",
    "L3Harris security engineer Saudi",
    "mechanical engineer airport Saudi Arabia",
]
|
|
|
|
# News-search phrases used by ``scan_news`` when the caller passes none.
DEFAULT_NEWS_KEYWORDS: list[str] = [
    "Smiths Detection",
    "GACA Saudi Arabia aviation",
    "airport security technology Saudi",
    "Riyadh airport expansion",
    "Saudi Arabia aviation security",
    "Nuctech airport",
    "baggage screening technology",
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Google Jobs (job-flavoured queries against the Google News RSS search feed)
# ---------------------------------------------------------------------------
|
|
|
|
async def scan_google_jobs(
    keywords: list[str] | None = None,
    location: str = "Saudi Arabia",
) -> list[dict]:
    """Search for job-related items via the Google News RSS search feed.

    One query of the form ``"<keyword> <location> jobs"`` is issued per
    keyword. No API key is required. A failed or unparsable feed is logged
    as a warning and skipped, so a single bad keyword never aborts the scan.

    Parameters
    ----------
    keywords:
        Search phrases. ``None`` or an empty list falls back to
        ``DEFAULT_JOB_KEYWORDS``.
    location:
        Free-text location appended to every query.

    Returns
    -------
    list[dict]
        Entries produced by ``_parse_rss`` with ``source="google_jobs"``,
        de-duplicated by URL (or by title when an entry has no URL), in
        first-seen order.
    """
    keywords = keywords or DEFAULT_JOB_KEYWORDS
    results: list[dict] = []

    # follow_redirects matches the Smiths Detection scanner: without it a 3xx
    # answer fails raise_for_status() and the feed is silently lost.
    async with httpx.AsyncClient(
        timeout=20.0, headers=_HEADERS, follow_redirects=True
    ) as client:
        for kw in keywords:
            query = quote_plus(f"{kw} {location} jobs")
            url = f"https://news.google.com/rss/search?q={query}&hl=en-SA&gl=SA&ceid=SA:en"
            try:
                resp = await client.get(url)
                resp.raise_for_status()
                results.extend(_parse_rss(resp.text, source="google_jobs"))
            except (httpx.HTTPStatusError, httpx.RequestError) as exc:
                logger.warning("google_jobs_error", keyword=kw, error=str(exc))
            except ET.ParseError as exc:
                logger.warning("google_jobs_xml_error", keyword=kw, error=str(exc))

    # Deduplicate by URL. Parsed entries always carry a "url" key (possibly
    # empty), so ``or`` is needed for the title fallback to ever apply;
    # ``.get("url", default)`` would collapse every link-less entry into one.
    seen: set[str] = set()
    unique: list[dict] = []
    for entry in results:
        key = entry.get("url") or entry.get("title", "")
        if key not in seen:
            seen.add(key)
            unique.append(entry)

    logger.info("google_jobs_scan_complete", count=len(unique))
    return unique
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LinkedIn (via linkedin-api library)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def scan_linkedin_jobs_api(
    linkedin_api: Any | None = None,
    keywords: list[str] | None = None,
) -> list[dict]:
    """Search LinkedIn for relevant jobs using the ``linkedin-api`` library.

    Each keyword is searched with ``location_name="Saudi Arabia"`` and a
    limit of 10 results. Any exception raised for a keyword is logged as a
    warning and the scan continues with the next keyword.

    Parameters
    ----------
    linkedin_api:
        An authenticated ``linkedin_api.Linkedin`` instance. If ``None``,
        returns an empty list (credentials not configured).
    keywords:
        Search terms. Defaults to Sami-relevant keywords.

    Returns
    -------
    list[dict]
        One dict per job with keys ``title``, ``company``, ``url``,
        ``description`` (at most 1000 characters) and ``source``
        (always ``"linkedin"``).
    """
    if linkedin_api is None:
        logger.info("linkedin_api_not_configured")
        return []

    keywords = keywords or [
        "field services engineer",
        "airport security engineer",
        "Smiths Detection",
        "METCO",
        "aviation security",
    ]

    loop = asyncio.get_running_loop()
    results: list[dict] = []
    for kw in keywords:
        try:
            # search_jobs is called synchronously by the original code; run it
            # in the default executor so the event loop is not blocked while
            # the request is in flight.
            search = functools.partial(
                linkedin_api.search_jobs,
                keywords=kw,
                location_name="Saudi Arabia",
                limit=10,
            )
            jobs = await loop.run_in_executor(None, search)

            for job in jobs or []:
                job_id = job.get("dashEntityUrn") or job.get("entityUrn") or ""
                # The numeric job id is the last ':'-separated URN segment.
                url = (
                    f"https://www.linkedin.com/jobs/view/{str(job_id).split(':')[-1]}"
                    if job_id
                    else ""
                )

                description = job.get("description") or ""
                if isinstance(description, dict):
                    # NOTE(review): assumes a nested payload keeps its text
                    # under "text" -- confirm against the library's output.
                    description = description.get("text") or ""

                results.append({
                    "title": job.get("title") or "",
                    "company": job.get("companyName") or job.get("company") or "",
                    "url": url,
                    # str() + ``or ""`` keep a None / non-string description
                    # from raising and discarding the rest of this keyword.
                    "description": str(description)[:1000],
                    "source": "linkedin",
                })
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: the scan is best-effort per keyword.
            logger.warning("linkedin_search_error", keyword=kw, error=str(exc))

    logger.info("linkedin_scan_complete", count=len(results))
    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# News -- RSS feeds for industry news
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Fixed Google News RSS searches that ``scan_news`` fetches on every run.
_NEWS_RSS_FEEDS: list[str] = [
    # Topic searches (Smiths Detection, Saudi aviation, airport expansion)
    "https://news.google.com/rss/search?q=Smiths+Detection&hl=en&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=airport+security+technology&hl=en&gl=SA&ceid=SA:en",
    "https://news.google.com/rss/search?q=Saudi+Arabia+aviation+security&hl=en&gl=SA&ceid=SA:en",
    "https://news.google.com/rss/search?q=GACA+Saudi+Arabia&hl=en&gl=SA&ceid=SA:en",
    "https://news.google.com/rss/search?q=Riyadh+airport+expansion&hl=en&gl=SA&ceid=SA:en",
    # Aviation security industry search
    "https://news.google.com/rss/search?q=baggage+screening+technology&hl=en&gl=US&ceid=US:en",
]
|
|
|
|
|
|
async def scan_news(
    keywords: list[str] | None = None,
) -> list[dict]:
    """Fetch industry news from RSS feeds and optional keyword searches.

    A failed or unparsable feed is logged as a warning and skipped.

    Parameters
    ----------
    keywords:
        Additional keywords to search via Google News RSS. ``None`` or an
        empty list falls back to ``DEFAULT_NEWS_KEYWORDS``. The built-in
        feed list always runs regardless.

    Returns
    -------
    list[dict]
        Entries produced by ``_parse_rss`` with ``source="news"``,
        de-duplicated by URL (or by title when an entry has no URL), in
        first-seen order.
    """
    keywords = keywords or DEFAULT_NEWS_KEYWORDS
    results: list[dict] = []

    # Build the full list of RSS URLs: built-in feeds first, then one search
    # per keyword.
    candidate_urls = list(_NEWS_RSS_FEEDS)
    candidate_urls.extend(
        f"https://news.google.com/rss/search?q={quote_plus(kw)}&hl=en&gl=SA&ceid=SA:en"
        for kw in keywords
    )
    # Some keyword searches reproduce a built-in feed URL exactly; drop the
    # repeats (keeping first-seen order) so no feed is downloaded twice.
    urls = list(dict.fromkeys(candidate_urls))

    # follow_redirects matches the Smiths Detection scanner: without it a 3xx
    # answer fails raise_for_status() and the feed is silently lost.
    async with httpx.AsyncClient(
        timeout=20.0, headers=_HEADERS, follow_redirects=True
    ) as client:
        for url in urls:
            try:
                resp = await client.get(url)
                resp.raise_for_status()
                results.extend(_parse_rss(resp.text, source="news"))
            except (httpx.HTTPStatusError, httpx.RequestError) as exc:
                logger.warning("news_rss_error", url=url[:80], error=str(exc))
            except ET.ParseError as exc:
                logger.warning("news_xml_error", url=url[:80], error=str(exc))

    # Deduplicate. Parsed entries always carry a "url" key (possibly empty),
    # so ``or`` is needed for the title fallback to ever apply.
    seen: set[str] = set()
    unique: list[dict] = []
    for entry in results:
        key = entry.get("url") or entry.get("title", "")
        if key not in seen:
            seen.add(key)
            unique.append(entry)

    logger.info("news_scan_complete", count=len(unique))
    return unique
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Smiths Detection careers page
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Careers landing page that is scraped for job-like links.
_SMITHS_CAREERS_URL = "https://www.smithsdetection.com/careers"
# Google News RSS search: "Smiths Detection" careers OR jobs OR hiring.
_SMITHS_JOBS_RSS = (
    "https://news.google.com/rss/search"
    "?q=%22Smiths+Detection%22+careers+OR+jobs+OR+hiring"
    "&hl=en&gl=US&ceid=US:en"
)
|
|
|
|
|
|
async def scan_smiths_detection_careers() -> list[dict]:
    """Check Smiths Detection for new job postings.

    Two sources are combined, in this order:

    1. A Google News RSS search for Smiths Detection hiring announcements;
       every entry is tagged with company ``"Smiths Detection"``.
    2. The careers page itself, from which job-like links are extracted.

    HTTP failures (and XML parse failures for the RSS source) are logged as
    warnings and the scan continues with whatever was collected.

    Returns
    -------
    list[dict]
        Opportunity dicts with ``source="smiths_detection_careers"``.
    """
    found: list[dict] = []

    async with httpx.AsyncClient(
        timeout=20.0, headers=_HEADERS, follow_redirects=True
    ) as client:
        # Source 1: Google News RSS for Smiths Detection job postings
        try:
            rss_resp = await client.get(_SMITHS_JOBS_RSS)
            rss_resp.raise_for_status()
            for item in _parse_rss(rss_resp.text, source="smiths_detection_careers"):
                item["company"] = "Smiths Detection"
                found.append(item)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("smiths_rss_error", error=str(exc))
        except ET.ParseError as exc:
            logger.warning("smiths_rss_xml_error", error=str(exc))

        # Source 2: scrape the careers page for job listing links
        try:
            page_resp = await client.get(_SMITHS_CAREERS_URL)
            page_resp.raise_for_status()
            # The helper appends its matches to ``found`` in place.
            _extract_career_links(page_resp.text, found)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("smiths_careers_page_error", error=str(exc))

    logger.info("smiths_detection_scan_complete", count=len(found))
    return found
|
|
|
|
|
|
# Anchor tags whose double-quoted href mentions a job-like word. Group 1 is
# the href, group 2 the (possibly tag-laden) link text.
_CAREER_LINK_RE = re.compile(
    r'<a[^>]+href="([^"]*(?:job|career|position|opening)[^"]*)"[^>]*>'
    r"(.*?)</a>",
    re.IGNORECASE | re.DOTALL,
)
# Any HTML tag; used to reduce link markup to plain text.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
# Base against which relative hrefs are resolved.
_SMITHS_SITE_ROOT = "https://www.smithsdetection.com/"


def _extract_career_links(html: str, results: list[dict]) -> None:
    """Naively extract job links from the Smiths Detection careers HTML.

    Parameters
    ----------
    html:
        Raw HTML of the careers page.
    results:
        List that receives one opportunity dict per accepted link. It is
        mutated in place; nothing is returned.

    Notes
    -----
    A link is accepted when its href contains ``job``, ``career``,
    ``position`` or ``opening`` (case-insensitive), resolves to an
    ``http``/``https`` URL, and its visible text is longer than five
    characters.
    """
    for match in _CAREER_LINK_RE.finditer(html):
        # Attribute values are entity-encoded in HTML (``&amp;`` -> ``&``).
        href = unescape(match.group(1)).strip()
        # urljoin copes with "/path", "path" and absolute URLs alike; plain
        # string concatenation produced "...comcareers/x" for "careers/x".
        url = urljoin(_SMITHS_SITE_ROOT, href)
        if not url.startswith(("http://", "https://")):
            # mailto:, javascript:, tel: ... are not job postings.
            continue

        # Strip tags first so encoded markup such as ``&lt;b&gt;`` stays
        # literal text, then decode entities and collapse whitespace runs.
        text = unescape(_HTML_TAG_RE.sub("", match.group(2)))
        title = " ".join(text.split())
        if len(title) > 5:
            results.append({
                "title": title,
                "company": "Smiths Detection",
                "url": url,
                "description": "",
                "source": "smiths_detection_careers",
            })
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# GACA (General Authority of Civil Aviation) announcements
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Google News RSS searches polled by ``scan_gaca_announcements``.
_GACA_URLS = [
    # English-language searches for GACA-related announcements
    "https://news.google.com/rss/search?q=GACA+Saudi+Arabia+aviation&hl=en&gl=SA&ceid=SA:en",
    "https://news.google.com/rss/search?q=%22General+Authority+of+Civil+Aviation%22+Saudi&hl=en&gl=SA&ceid=SA:en",
    # Arabic-language search (percent-encoded Arabic query, hl=ar)
    "https://news.google.com/rss/search?q=%D8%A7%D9%84%D8%B7%D9%8A%D8%B1%D8%A7%D9%86+%D8%A7%D9%84%D9%85%D8%AF%D9%86%D9%8A+%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A&hl=ar&gl=SA&ceid=SA:ar",
]
|
|
|
|
|
|
async def scan_gaca_announcements() -> list[dict]:
    """Monitor GACA (Saudi General Authority of Civil Aviation) news.

    Uses Google News RSS to find announcements related to GACA, Saudi
    aviation regulation, and airport security mandates. Entries whose feed
    item names no publisher get the company ``"GACA / Saudi Aviation"``.
    A failed or unparsable feed is logged as a warning and skipped.

    Returns
    -------
    list[dict]
        Entries produced by ``_parse_rss`` with ``source="gaca"``,
        de-duplicated by URL (or by title when an entry has no URL), in
        first-seen order.
    """
    results: list[dict] = []

    # follow_redirects matches the Smiths Detection scanner: without it a 3xx
    # answer fails raise_for_status() and the feed is silently lost.
    async with httpx.AsyncClient(
        timeout=20.0, headers=_HEADERS, follow_redirects=True
    ) as client:
        for url in _GACA_URLS:
            try:
                resp = await client.get(url)
                resp.raise_for_status()
                entries = _parse_rss(resp.text, source="gaca")
                for entry in entries:
                    if not entry.get("company"):
                        entry["company"] = "GACA / Saudi Aviation"
                results.extend(entries)
            except (httpx.HTTPStatusError, httpx.RequestError) as exc:
                logger.warning("gaca_rss_error", url=url[:80], error=str(exc))
            except ET.ParseError as exc:
                logger.warning("gaca_xml_error", url=url[:80], error=str(exc))

    # Deduplicate. Parsed entries always carry a "url" key (possibly empty),
    # so ``or`` is needed for the title fallback to ever apply.
    seen: set[str] = set()
    unique: list[dict] = []
    for entry in results:
        key = entry.get("url") or entry.get("title", "")
        if key not in seen:
            seen.add(key)
            unique.append(entry)

    logger.info("gaca_scan_complete", count=len(unique))
    return unique
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# RSS parsing helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_rss(xml_text: str, source: str) -> list[dict]:
    """Parse an RSS 2.0 feed and return a list of opportunity dicts.

    Parameters
    ----------
    xml_text:
        The feed document as text.
    source:
        Value stored under the ``"source"`` key of every returned dict.

    Returns
    -------
    list[dict]
        One dict per ``<item>`` that has a non-empty title, with keys
        ``title``, ``company`` (text of the item's ``<source>`` tag, or
        ``""``), ``url``, ``description`` (at most 1000 characters) and
        ``source``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    """
    # The parser rejects anything ahead of an ``<?xml ...?>`` declaration, so
    # a stray BOM or leading blank line would otherwise discard a valid feed.
    cleaned = xml_text.lstrip("\ufeff \t\r\n")

    # NOTE: the stdlib parser is not hardened against malicious XML; the
    # feeds are remote content, hence the explicit lint suppression.
    root = ET.fromstring(cleaned)  # noqa: S314

    results: list[dict] = []
    # RSS 2.0: /rss/channel/item
    for item in root.findall(".//item"):
        title = (item.findtext("title") or "").strip()
        if not title:
            continue

        # Google News often puts the publisher in the <source> tag
        company = (item.findtext("source") or "").strip()

        results.append({
            "title": title,
            "company": company,
            "url": (item.findtext("link") or "").strip(),
            "description": (item.findtext("description") or "").strip()[:1000],
            "source": source,
        })

    return results
|