feat: Add tool to estimate date of a webpage
All checks were successful
Build Docker / BuildImage (push) Successful in 2m34s

This commit is contained in:
2025-10-16 16:48:26 +11:00
parent 7f591e2724
commit 57a4b977ec
2 changed files with 286 additions and 50 deletions

View File

@@ -2,16 +2,25 @@ from flask import Blueprint, request, jsonify
import os import os
import datetime import datetime
import requests import requests
import re
from mail import sendEmail from mail import sendEmail
from tools import getClientIP, getGitCommit, json_response from tools import getClientIP, getGitCommit, json_response, parse_date
from blueprints.sol import sol_bp from blueprints.sol import sol_bp
from dateutil import parser as date_parser
# Constants
HTTP_OK = 200
HTTP_BAD_REQUEST = 400
HTTP_UNAUTHORIZED = 401
HTTP_NOT_FOUND = 404
HTTP_UNSUPPORTED_MEDIA = 415
HTTP_SERVER_ERROR = 500
api_bp = Blueprint('api', __name__) api_bp = Blueprint('api', __name__)
# Register solana blueprint # Register solana blueprint
api_bp.register_blueprint(sol_bp) api_bp.register_blueprint(sol_bp)
# Load configuration
NC_CONFIG = requests.get( NC_CONFIG = requests.get(
"https://cloud.woodburn.au/s/4ToXgFe3TnnFcN7/download/website-conf.json" "https://cloud.woodburn.au/s/4ToXgFe3TnnFcN7/download/website-conf.json"
).json() ).json()
@@ -23,6 +32,7 @@ if 'time-zone' not in NC_CONFIG:
@api_bp.route("/") @api_bp.route("/")
@api_bp.route("/help") @api_bp.route("/help")
def help(): def help():
"""Provide API documentation and help."""
return jsonify({ return jsonify({
"message": "Welcome to Nathan.Woodburn/ API! This is a personal website. For more information, visit https://nathan.woodburn.au", "message": "Welcome to Nathan.Woodburn/ API! This is a personal website. For more information, visit https://nathan.woodburn.au",
"endpoints": { "endpoints": {
@@ -37,71 +47,78 @@ def help():
"base_url": "/api/v1", "base_url": "/api/v1",
"version": getGitCommit(), "version": getGitCommit(),
"ip": getClientIP(request), "ip": getClientIP(request),
"status": 200 "status": HTTP_OK
}) })
@api_bp.route("/version")
def version():
    """Get the current version of the website.

    Returns:
        A JSON response whose ``version`` field is the value returned by
        ``getGitCommit()``.
    """
    return jsonify({"version": getGitCommit()})
@api_bp.route("/time")
def time():
    """Get the current time in the configured timezone.

    The UTC offset (in hours) is read from ``NC_CONFIG["time-zone"]``.

    Returns:
        A JSON response with the formatted time string, the POSIX timestamp,
        the configured offset, the ISO-8601 time, the client IP and a status.
    """
    # Named ``tz`` so the local does not shadow the ``timezone`` view function.
    tz = datetime.timezone(offset=datetime.timedelta(hours=NC_CONFIG["time-zone"]))
    current_time = datetime.datetime.now(tz=tz)
    return jsonify({
        "timestring": current_time.strftime("%A, %B %d, %Y %I:%M %p"),
        "timestamp": current_time.timestamp(),
        "timezone": NC_CONFIG["time-zone"],
        "timeISO": current_time.isoformat(),
        "ip": getClientIP(request),
        "status": HTTP_OK
    })
@api_bp.route("/timezone")
def timezone():
    """Get the current timezone setting.

    Returns:
        A JSON response with ``NC_CONFIG["time-zone"]``, the client IP and a
        status field.
    """
    return jsonify({
        "timezone": NC_CONFIG["time-zone"],
        "ip": getClientIP(request),
        "status": HTTP_OK
    })
@api_bp.route("/message")
def message():
    """Get the message from the configuration.

    Returns:
        A JSON response with ``NC_CONFIG["message"]``, the client IP and a
        status field.
    """
    return jsonify({
        "message": NC_CONFIG["message"],
        "ip": getClientIP(request),
        "status": HTTP_OK
    })
@api_bp.route("/ip")
def ip():
    """Get the client's IP address.

    Returns:
        A JSON response with the value of ``getClientIP(request)`` and a
        status field.
    """
    return jsonify({
        "ip": getClientIP(request),
        "status": HTTP_OK
    })
@api_bp.route("/email", methods=["POST"])
def email_post():
    """Send an email via the API (requires API key).

    The request body must be a JSON object containing a ``key`` field that
    matches the ``EMAIL_KEY`` environment variable.

    Returns:
        The result of ``sendEmail(data)`` when the key is valid, otherwise a
        JSON error response:
        415 when the request is not JSON, 400 when the body is empty, not an
        object or lacks ``key``, 401 when the key does not match.
    """
    import hmac

    # Verify json
    if not request.is_json:
        return json_response(request, "415 Unsupported Media Type", HTTP_UNSUPPORTED_MEDIA)

    # silent=True turns a malformed body into None so it gets the same JSON
    # 400 as an empty one; a non-object body (e.g. a JSON array) is rejected
    # here instead of failing later on data["key"].
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return json_response(request, "400 Bad Request", HTTP_BAD_REQUEST)

    # Check if api key sent
    if "key" not in data:
        return json_response(request, "400 Bad Request 'key' missing", HTTP_BAD_REQUEST)

    expected_key = os.getenv("EMAIL_KEY")
    supplied_key = data["key"]
    # An unset EMAIL_KEY must never authorise anyone: without this guard a
    # body of {"key": null} compared equal to the missing variable.
    # compare_digest keeps the comparison constant-time; both sides are
    # encoded to bytes because it rejects non-ASCII str arguments.
    if (
        not expected_key
        or not isinstance(supplied_key, str)
        or not hmac.compare_digest(
            supplied_key.encode("utf-8", "surrogatepass"),
            expected_key.encode("utf-8", "surrogatepass"),
        )
    ):
        return json_response(request, "401 Unauthorized", HTTP_UNAUTHORIZED)

    # TODO: Add client info to email
    return sendEmail(data)
@@ -109,6 +126,7 @@ def email_post():
@api_bp.route("/project") @api_bp.route("/project")
def project(): def project():
"""Get information about the current git project."""
gitinfo = { gitinfo = {
"website": None, "website": None,
} }
@@ -129,12 +147,125 @@ def project():
gitinfo["website"] = git["repo"]["website"] gitinfo["website"] = git["repo"]["website"]
except Exception as e: except Exception as e:
print(f"Error getting git data: {e}") print(f"Error getting git data: {e}")
return json_response(request, "500 Internal Server Error", 500) return json_response(request, "500 Internal Server Error", HTTP_SERVER_ERROR)
return jsonify({ return jsonify({
"repo_name": repo_name, "repo_name": repo_name,
"repo_description": repo_description, "repo_description": repo_description,
"repo": gitinfo, "repo": gitinfo,
"ip": getClientIP(request), "ip": getClientIP(request),
"status": 200 "status": HTTP_OK
}) })
# Query-string spellings that switch on verbose output.
_TRUTHY_FLAGS = frozenset({"true", "1", "yes", "y"})

# Same hop limit that requests applies when it follows redirects itself.
_MAX_REDIRECTS = 30

_ORDINAL_SUFFIX_RE = re.compile(r'(\d+)(st|nd|rd|th)', flags=re.IGNORECASE)
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', flags=re.DOTALL)

# (pattern, numeric field orders). The list index is reported to clients as
# "pattern_used", so the order of the entries must not change.
# A non-empty orders tuple means every captured group is a number and the date
# is built directly, trying each order in turn; an empty tuple means the
# groups contain a month name and are handed to parse_date().
_CONTENT_DATE_PATTERNS = [
    (re.compile(r'(\d{4})[/-](\d{1,2})[/-](\d{1,2})'), ("ymd",)),  # YYYY-MM-DD
    # DD-MM-YYYY; falls back to MM-DD-YYYY only when day-first is impossible
    (re.compile(r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})'), ("dmy", "mdy")),
    (re.compile(r'(?:Last updated:|Updated:|Updated last:)?\s*(\d{1,2})\s+([A-Za-z]{3,9})[, ]?\s*(\d{4})'), ()),  # DD Month YYYY
    (re.compile(r'(?:\b\w+\b\s+){0,3}([A-Za-z]{3,9})\s+(\d{1,2}),?\s*(\d{4})'), ()),  # Month DD, YYYY with optional words
    (re.compile(r'\b(\d{4})(\d{2})(\d{2})\b'), ("ymd",)),  # YYYYMMDD
    (re.compile(r'(?:Last updated:|Updated:|Last update)?\s*([A-Za-z]{3,9})\s+(\d{4})'), ()),  # Month YYYY only
]

# Structured data patterns: (pattern, date type reported to the client)
_STRUCTURED_DATE_PATTERNS = [
    (re.compile(r'"datePublished"\s*:\s*"([^"]+)"'), "published"),
    (re.compile(r'"dateModified"\s*:\s*"([^"]+)"'), "modified"),
    (re.compile(r'<meta\s+(?:[^>]*?)property\s*=\s*"article:published_time"\s+content\s*=\s*"([^"]+)"'), "published"),
    (re.compile(r'<meta\s+(?:[^>]*?)property\s*=\s*"article:modified_time"\s+content\s*=\s*"([^"]+)"'), "modified"),
    (re.compile(r'<time\s+datetime\s*=\s*"([^"]+)"'), "published"),
]


class _PrivateAddressError(ValueError):
    """Raised when a URL (or one of its redirects) points at a non-public host."""


def _resolves_to_private_address(url: str) -> bool:
    """Return True when the URL's host resolves to a non-global IP address."""
    import ipaddress
    import socket
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
    except ValueError:
        # Malformed URL: not blocked here, requests rejects it with its own error.
        return False
    if not host:
        return False
    try:
        address_infos = socket.getaddrinfo(host, None)
    except (OSError, UnicodeError):
        # Unresolvable host: the fetch itself fails and reports "unreachable".
        return False
    for info in address_infos:
        # info[4] is the sockaddr tuple; drop any IPv6 zone id ("%eth0").
        address = info[4][0].split("%", 1)[0]
        try:
            if not ipaddress.ip_address(address).is_global:
                return True
        except ValueError:
            return True
    return False


def _fetch_public_page(url: str):
    """Fetch a page with GET, validating every redirect hop before following it.

    Raises:
        _PrivateAddressError: a hop resolves to a non-public address.
        requests.exceptions.RequestException: the page cannot be fetched.
    """
    from urllib.parse import urljoin

    current_url = url
    # A session keeps cookies across the manually followed redirects.
    with requests.Session() as session:
        for _ in range(_MAX_REDIRECTS + 1):
            # NOTE(review): this narrows SSRF but does not close it - the host
            # is resolved again by requests (DNS rebinding), and URL parser
            # differences are not covered.
            if _resolves_to_private_address(current_url):
                raise _PrivateAddressError(current_url)
            response = session.get(current_url, timeout=5, allow_redirects=False)
            if not response.is_redirect:
                response.raise_for_status()
                return response
            current_url = urljoin(current_url, response.headers["Location"])
    raise requests.exceptions.TooManyRedirects(f"Exceeded {_MAX_REDIRECTS} redirects")


def _candidate_to_date(date_groups, pattern_format):
    """Convert one found-date entry into a ``datetime.date`` (None if invalid)."""
    try:
        if pattern_format == -1:
            # Metadata dates were already normalised to YYYY-MM-DD.
            return datetime.datetime.strptime(date_groups[0], "%Y-%m-%d").date()

        field_orders = _CONTENT_DATE_PATTERNS[pattern_format][1]
        if not field_orders:
            parsed_date = parse_date(date_groups)
            if not parsed_date:
                return None
            return datetime.datetime.strptime(parsed_date, "%Y-%m-%d").date()

        numbers = [int(group) for group in date_groups]
    except ValueError:
        return None

    # All-numeric dates are built directly so the documented field order is
    # honoured: a generic parser reads "03-04-2025" month-first.
    for order in field_orders:
        fields = dict(zip(order, numbers))
        try:
            return datetime.date(fields["y"], fields["m"], fields["d"])
        except ValueError:
            continue
    return None


@api_bp.route("/page_date")
def page_date():
    """Estimate the date of a webpage from the dates it contains.

    Query parameters:
        url: The http(s) page to inspect (required).
        verbose: When one of "true", "1", "yes" or "y", every accepted date
            is included in the response together with how it was found.

    The page is downloaded, ordinal suffixes and HTML comments are removed,
    and dates are collected from free text and from structured metadata
    (JSON-LD ``datePublished``/``dateModified``, ``article:*_time`` meta tags
    and ``<time datetime>``). Dates more than one day in the future are
    discarded and the latest remaining date is returned.

    Returns:
        A JSON response with ``latest`` and ``type`` (plus ``dates`` when
        verbose), or a 400 JSON error when the url is missing, invalid,
        unreachable, or yields no usable date.
    """
    url = request.args.get("url")
    if not url:
        return json_response(request, "400 Bad Request 'url' missing", HTTP_BAD_REQUEST)

    verbose = request.args.get("verbose", "").lower() in _TRUTHY_FLAGS

    if not url.startswith(("https://", "http://")):
        return json_response(request, "400 Bad Request 'url' invalid", HTTP_BAD_REQUEST)

    try:
        r = _fetch_public_page(url)
    except _PrivateAddressError:
        return json_response(request, "400 Bad Request 'url' invalid", HTTP_BAD_REQUEST)
    except requests.exceptions.RequestException as e:
        return json_response(request, f"400 Bad Request 'url' unreachable: {e}", HTTP_BAD_REQUEST)

    page_text = r.text

    # Remove ordinal suffixes globally
    page_text = _ORDINAL_SUFFIX_RE.sub(r'\1', page_text)
    # Remove HTML comments
    page_text = _HTML_COMMENT_RE.sub('', page_text)

    # Each entry is [groups, pattern index (-1 for metadata), date type].
    found_dates = []

    # Extract content dates
    for idx, (pattern, _field_orders) in enumerate(_CONTENT_DATE_PATTERNS):
        for match in pattern.findall(page_text):
            if not match:
                continue
            groups = match[-3:]  # last three elements
            found_dates.append([groups, idx, "content"])

    # Extract structured data dates
    for pattern, date_type in _STRUCTURED_DATE_PATTERNS:
        for match in pattern.findall(page_text):
            try:
                dt = date_parser.isoparse(match)
            except (ValueError, TypeError):
                continue
            found_dates.append([[dt.strftime('%Y-%m-%d')], -1, date_type])

    if not found_dates:
        return json_response(request, "Date not found on page", HTTP_BAD_REQUEST)

    today = datetime.date.today()
    tolerance_date = today + datetime.timedelta(days=1)  # Allow for slight future dates (e.g., time zones)

    processed_dates = []
    for date_groups, pattern_format, date_type in found_dates:
        dt = _candidate_to_date(date_groups, pattern_format)
        # Only keep dates in the past (with tolerance)
        if dt is None or dt > tolerance_date:
            continue

        date_obj = {"date": dt.strftime("%Y-%m-%d"), "type": date_type}
        if verbose:
            if pattern_format == -1:
                date_obj.update({"source": "metadata", "pattern_used": pattern_format, "raw": date_groups[0]})
            else:
                date_obj.update({"source": "content", "pattern_used": pattern_format, "raw": " ".join(date_groups)})
        processed_dates.append(date_obj)

    if not processed_dates:
        if verbose:
            return jsonify({
                "message": "No valid dates found on page",
                "found_dates": found_dates,
                "processed_dates": processed_dates
            }), HTTP_BAD_REQUEST
        return json_response(request, "No valid dates found on page", HTTP_BAD_REQUEST)

    # Sort dates and return latest
    processed_dates.sort(key=lambda x: x["date"])
    latest = processed_dates[-1]

    response = {"latest": latest["date"], "type": latest["type"]}
    if verbose:
        response["dates"] = processed_dates

    return json_response(request, response, HTTP_OK)

171
tools.py
View File

@@ -1,18 +1,44 @@
from flask import Request, render_template, jsonify, make_response from flask import Request, render_template, jsonify, make_response
import os import os
from functools import cache from functools import cache
import datetime
from typing import Optional, Dict, Union, Tuple
import re
from dateutil.parser import parse
# HTTP status codes
HTTP_OK = 200
HTTP_BAD_REQUEST = 400
HTTP_NOT_FOUND = 404
def getClientIP(request: Request) -> str:
    """
    Get the client's IP address from the request.

    The first entry of the ``X-Forwarded-For`` header is used when the header
    is present; otherwise ``request.remote_addr`` is used.
    NOTE(review): the header is sent by the client unless a trusted proxy
    overwrites it - confirm the deployment before relying on this value.

    Args:
        request (Request): The Flask request object

    Returns:
        str: The client's IP address, or "unknown" when none is available
    """
    x_forwarded_for = request.headers.get("X-Forwarded-For")
    if x_forwarded_for:
        # Entries are comma separated and may carry surrounding whitespace.
        ip = x_forwarded_for.split(",")[0].strip()
    else:
        ip = request.remote_addr
    # Covers both a missing remote address and an empty forwarded entry.
    if not ip:
        ip = "unknown"
    return ip
def getGitCommit(): def getGitCommit() -> str:
"""
Get the current git commit hash.
Returns:
str: The current git commit hash or a failure message
"""
# if .git exists, get the latest commit hash # if .git exists, get the latest commit hash
if os.path.isdir(".git"): if os.path.isdir(".git"):
git_dir = ".git" git_dir = ".git"
@@ -35,77 +61,156 @@ def getGitCommit():
def isCurl(request: Request) -> bool:
    """
    Check if the request is from curl or hurl.

    Args:
        request (Request): The Flask request object

    Returns:
        bool: True if the User-Agent header contains "curl" or "hurl",
        False otherwise (including when the header is absent)
    """
    if request.headers and request.headers.get("User-Agent"):
        user_agent = request.headers.get("User-Agent", "")
        return "curl" in user_agent or "hurl" in user_agent
    return False
def isCrawler(request: Request) -> bool:
    """
    Check if the request is from a web crawler (e.g., Googlebot, Bingbot).

    Args:
        request (Request): The Flask request object

    Returns:
        bool: True if the User-Agent header contains "Googlebot" or
        "Bingbot", False otherwise (including when the header is absent)
    """
    if request.headers and request.headers.get("User-Agent"):
        user_agent = request.headers.get("User-Agent", "")
        return "Googlebot" in user_agent or "Bingbot" in user_agent
    return False
@cache
def getAddress(coin: str) -> str:
    """
    Get the wallet address for a cryptocurrency.

    The address is read from ``.well-known/wallets/<COIN>`` (coin code
    upper-cased, path relative to the working directory). Results are
    memoised per ``coin`` argument by ``functools.cache``.

    Args:
        coin (str): The cryptocurrency code

    Returns:
        str: The wallet address or empty string if not found
    """
    address = ""
    wallet_path = f".well-known/wallets/{coin.upper()}"
    if os.path.isfile(wallet_path):
        with open(wallet_path) as file:
            address = file.read()
    return address
def getFilePath(name: str, path: str) -> Optional[str]:
    """
    Find a file in a directory tree.

    Args:
        name (str): The filename to find
        path (str): The root directory to search

    Returns:
        Optional[str]: The full path to the first match found by
        ``os.walk`` or None if not found
    """
    for root, _dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
    return None
def json_response(request: Request, message: Union[str, Dict] = "404 Not Found", code: int = 404):
    """
    Create a JSON response with standard formatting.

    A dict ``message`` is used as the response body with ``status`` and ``ip``
    added; any other ``message`` is placed under the ``message`` key.

    Args:
        request (Request): The Flask request object
        message (Union[str, Dict]): The response message or data
        code (int): The HTTP status code

    Returns:
        A ``(response, code)`` tuple where ``response`` is the object
        produced by ``jsonify``
    """
    client_ip = getClientIP(request)
    if isinstance(message, dict):
        # Build a new dict so the caller's object is not modified.
        payload = {**message, "status": code, "ip": client_ip}
    else:
        payload = {
            "status": code,
            "message": message,
            "ip": client_ip,
        }
    return jsonify(payload), code
def error_response(
    request: Request,
    message: str = "404 Not Found",
    code: int = 404,
    force_json: bool = False
) -> Union[Tuple[Dict, int], object]:
    """
    Create an error response in JSON or HTML format.

    JSON is returned when ``force_json`` is set or the client is curl/hurl;
    otherwise an HTML template is rendered and the message is also exposed in
    the ``X-Error-Message`` response header.

    Args:
        request (Request): The Flask request object
        message (str): The error message
        code (int): The HTTP status code
        force_json (bool): Whether to force JSON response regardless of client

    Returns:
        Union[Tuple[Dict, int], object]: The JSON or HTML response
    """
    if force_json or isCurl(request):
        return json_response(request, message, code)

    # Check if <error code>.html exists in templates, else fall back to 404.html
    template_name = f"{code}.html" if os.path.isfile(f"templates/{code}.html") else "404.html"
    response = make_response(render_template(
        template_name, code=code, message=message), code)

    # Add message to response headers
    response.headers["X-Error-Message"] = message
    return response
def parse_date(date_groups: list[str]) -> str | None:
    """
    Parse a list of date components into YYYY-MM-DD format.
    Uses dateutil.parser for robust parsing.

    Works for:
    - DD Month YYYY
    - Month DD, YYYY
    - YYYY-MM-DD
    - YYYYMMDD
    - Month YYYY (defaults day to 1)
    - Handles ordinal suffixes (st, nd, rd, th)

    Args:
        date_groups (list[str]): The date components, joined with spaces
            before parsing

    Returns:
        str | None: The date as zero-padded YYYY-MM-DD, or None when the
        input is empty, cannot be parsed, or contains no year
    """
    try:
        # Join date groups into a single string
        date_str = " ".join(date_groups).strip()
        if not date_str:
            return None

        # Remove ordinal suffixes
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str, flags=re.IGNORECASE)

        # Parse with dateutil, default day=1 if missing
        dt = parse(date_str, default=datetime.datetime(1900, 1, 1))

        # 1900 is what a missing year falls back to, but it can also be a real
        # year. Re-parse with another default: the year only follows the
        # default when the input did not contain one.
        if dt.year == 1900:
            probe = parse(date_str, default=datetime.datetime(1904, 1, 1))
            if probe.year != 1900:
                return None

        # isoformat() always zero-pads the year; strftime("%Y") padding for
        # years below 1000 is platform-dependent.
        return dt.date().isoformat()
    except (ValueError, TypeError, OverflowError):
        # dateutil documents OverflowError for out-of-range numeric fields.
        return None