feat: Add tool to estimate date of a webpage
All checks were successful
Build Docker / BuildImage (push) Successful in 2m34s

This commit is contained in:
2025-10-16 16:48:26 +11:00
parent 7f591e2724
commit 57a4b977ec
2 changed files with 286 additions and 50 deletions

View File

@@ -2,16 +2,25 @@ from flask import Blueprint, request, jsonify
import os
import datetime
import requests
import re
from mail import sendEmail
from tools import getClientIP, getGitCommit, json_response
from tools import getClientIP, getGitCommit, json_response, parse_date
from blueprints.sol import sol_bp
from dateutil import parser as date_parser
# Constants
HTTP_OK = 200
HTTP_BAD_REQUEST = 400
HTTP_UNAUTHORIZED = 401
HTTP_NOT_FOUND = 404
HTTP_UNSUPPORTED_MEDIA = 415
HTTP_SERVER_ERROR = 500
api_bp = Blueprint('api', __name__)
# Register solana blueprint
api_bp.register_blueprint(sol_bp)
# Load configuration
NC_CONFIG = requests.get(
"https://cloud.woodburn.au/s/4ToXgFe3TnnFcN7/download/website-conf.json"
).json()
@@ -23,6 +32,7 @@ if 'time-zone' not in NC_CONFIG:
@api_bp.route("/")
@api_bp.route("/help")
def help():
"""Provide API documentation and help."""
return jsonify({
"message": "Welcome to Nathan.Woodburn/ API! This is a personal website. For more information, visit https://nathan.woodburn.au",
"endpoints": {
@@ -37,71 +47,78 @@ def help():
"base_url": "/api/v1",
"version": getGitCommit(),
"ip": getClientIP(request),
"status": 200
"status": HTTP_OK
})
@api_bp.route("/version")
def version():
    """Report the git commit the website is currently running."""
    payload = {"version": getGitCommit()}
    return jsonify(payload)
@api_bp.route("/time")
def time():
    """Return the current time expressed in the configured UTC offset."""
    # The configured "time-zone" value is used as an offset in hours.
    tz = datetime.timezone(
        offset=datetime.timedelta(hours=NC_CONFIG["time-zone"])
    )
    now = datetime.datetime.now(tz=tz)
    payload = {
        "timestring": now.strftime("%A, %B %d, %Y %I:%M %p"),
        "timestamp": now.timestamp(),
        "timezone": NC_CONFIG["time-zone"],
        "timeISO": now.isoformat(),
        "ip": getClientIP(request),
        "status": HTTP_OK,
    }
    return jsonify(payload)
@api_bp.route("/timezone")
def timezone():
    """Return the timezone value stored in the loaded configuration."""
    payload = {
        "timezone": NC_CONFIG["time-zone"],
        "ip": getClientIP(request),
        "status": HTTP_OK,
    }
    return jsonify(payload)
@api_bp.route("/message")
def message():
    """Return the message stored in the loaded configuration."""
    payload = {
        "message": NC_CONFIG["message"],
        "ip": getClientIP(request),
        "status": HTTP_OK,
    }
    return jsonify(payload)
@api_bp.route("/ip")
def ip():
    """Echo the caller's IP address back as JSON."""
    payload = {
        "ip": getClientIP(request),
        "status": HTTP_OK,
    }
    return jsonify(payload)
@api_bp.route("/email", methods=["POST"])
def email_post():
    """Send an email via the API (requires API key).

    The request body must be a JSON object containing a "key" entry that
    matches the EMAIL_KEY environment variable; the whole object is then
    handed to sendEmail.

    Returns:
        The result of sendEmail on success, otherwise a JSON error response:
        415 when the body is not JSON, 400 when it is empty, not an object,
        or lacks "key", and 401 when the key is wrong or no key is configured.
    """
    import hmac

    # Verify json
    if not request.is_json:
        return json_response(request, "415 Unsupported Media Type", HTTP_UNSUPPORTED_MEDIA)

    # Only a JSON object can carry the key; a bare string or list would make
    # the lookups below misbehave or raise.
    data = request.json
    if not data or not isinstance(data, dict):
        return json_response(request, "400 Bad Request", HTTP_BAD_REQUEST)

    if "key" not in data:
        return json_response(request, "400 Bad Request 'key' missing", HTTP_BAD_REQUEST)

    expected_key = os.getenv("EMAIL_KEY")
    provided_key = data["key"]
    # Refuse outright when no key is configured: otherwise a JSON null would
    # compare equal to the missing environment variable. Compare as bytes in
    # constant time so the check does not leak how much of the key matched.
    authorised = (
        bool(expected_key)
        and isinstance(provided_key, str)
        and hmac.compare_digest(provided_key.encode(), expected_key.encode())
    )
    if not authorised:
        return json_response(request, "401 Unauthorized", HTTP_UNAUTHORIZED)

    # TODO: Add client info to email
    return sendEmail(data)
@@ -109,6 +126,7 @@ def email_post():
@api_bp.route("/project")
def project():
"""Get information about the current git project."""
gitinfo = {
"website": None,
}
@@ -129,12 +147,125 @@ def project():
gitinfo["website"] = git["repo"]["website"]
except Exception as e:
print(f"Error getting git data: {e}")
return json_response(request, "500 Internal Server Error", 500)
return json_response(request, "500 Internal Server Error", HTTP_SERVER_ERROR)
return jsonify({
"repo_name": repo_name,
"repo_description": repo_description,
"repo": gitinfo,
"ip": getClientIP(request),
"status": 200
"status": HTTP_OK
})
@api_bp.route("/page_date")
def page_date():
    """Estimate the most recent date mentioned on a web page.

    Query parameters:
        url: Absolute http:// or https:// URL of the page to inspect (required).
        verbose: "true", "1", "yes" or "y" (case-insensitive) to also return
            every accepted date together with how it was matched.

    Returns:
        A JSON response holding the latest accepted date ("latest") and its
        type ("content", "published" or "modified"), or a 400 JSON error when
        the URL is missing, invalid or unreachable, or no usable date is found.
    """
    url = request.args.get("url")
    if not url:
        return json_response(request, "400 Bad Request 'url' missing", HTTP_BAD_REQUEST)

    verbose = request.args.get("verbose", "").lower() in ["true", "1", "yes", "y"]

    if not url.startswith(("https://", "http://")):
        return json_response(request, "400 Bad Request 'url' invalid", HTTP_BAD_REQUEST)

    # NOTE(review): the URL is caller-controlled and fetched from the server,
    # so this endpoint can be aimed at internal hosts (SSRF). Consider
    # rejecting loopback/private destinations before issuing the request.
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        return json_response(request, f"400 Bad Request 'url' unreachable: {e}", HTTP_BAD_REQUEST)

    page_text = r.text

    # Remove ordinal suffixes globally
    page_text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', page_text, flags=re.IGNORECASE)
    # Remove HTML comments
    page_text = re.sub(r'<!--.*?-->', '', page_text, flags=re.DOTALL)

    date_patterns = [
        r'(\d{4})[/-](\d{1,2})[/-](\d{1,2})',  # YYYY-MM-DD
        r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})',  # DD-MM-YYYY
        r'(?:Last updated:|Updated:|Updated last:)?\s*(\d{1,2})\s+([A-Za-z]{3,9})[, ]?\s*(\d{4})',  # DD Month YYYY
        r'(?:\b\w+\b\s+){0,3}([A-Za-z]{3,9})\s+(\d{1,2}),?\s*(\d{4})',  # Month DD, YYYY with optional words
        r'\b(\d{4})(\d{2})(\d{2})\b',  # YYYYMMDD
        r'(?:Last updated:|Updated:|Last update)?\s*([A-Za-z]{3,9})\s+(\d{4})',  # Month YYYY only
    ]
    # Index of the DD-MM-YYYY entry in date_patterns
    day_first_pattern = 1

    # Structured data patterns
    json_date_patterns = {
        r'"datePublished"\s*:\s*"([^"]+)"': "published",
        r'"dateModified"\s*:\s*"([^"]+)"': "modified",
        r'<meta\s+(?:[^>]*?)property\s*=\s*"article:published_time"\s+content\s*=\s*"([^"]+)"': "published",
        r'<meta\s+(?:[^>]*?)property\s*=\s*"article:modified_time"\s+content\s*=\s*"([^"]+)"': "modified",
        r'<time\s+datetime\s*=\s*"([^"]+)"': "published"
    }

    # Each entry is [captured groups, pattern index (-1 for metadata), type]
    found_dates = []

    # Extract content dates
    for idx, pattern in enumerate(date_patterns):
        for match in re.findall(pattern, page_text):
            if not match:
                continue
            groups = match[-3:]  # last three elements
            found_dates.append([groups, idx, "content"])

    # Extract structured data dates
    for pattern, date_type in json_date_patterns.items():
        for match in re.findall(pattern, page_text):
            try:
                dt = date_parser.isoparse(match)
                formatted_date = dt.strftime('%Y-%m-%d')
                found_dates.append([[formatted_date], -1, date_type])
            except (ValueError, TypeError):
                continue

    if not found_dates:
        return json_response(request, "Date not found on page", HTTP_BAD_REQUEST)

    today = datetime.date.today()
    tolerance_date = today + datetime.timedelta(days=1)  # Allow for slight future dates (e.g., time zones)

    processed_dates = []
    for date_groups, pattern_format, date_type in found_dates:
        if pattern_format == -1:
            # Already formatted date
            iso_text = date_groups[0]
        else:
            if pattern_format == day_first_pattern:
                # Hand the parts over year-first so an ambiguous day-first
                # date such as 05/03/2024 is not read as month-first.
                parts = list(reversed(date_groups))
            else:
                parts = list(date_groups)
            iso_text = parse_date(parts)
            if not iso_text:
                continue

        # A malformed string (e.g. an unpadded year) must only discard this
        # candidate, not fail the whole request.
        try:
            dt = datetime.datetime.strptime(iso_text, "%Y-%m-%d").date()
        except ValueError:
            continue

        # Only keep dates in the past (with tolerance)
        if dt <= tolerance_date:
            # isoformat() always zero-pads, which keeps the string sort below
            # chronological.
            date_obj = {"date": dt.isoformat(), "type": date_type}
            if verbose:
                if pattern_format == -1:
                    date_obj.update({"source": "metadata", "pattern_used": pattern_format, "raw": date_groups[0]})
                else:
                    date_obj.update({"source": "content", "pattern_used": pattern_format, "raw": " ".join(date_groups)})
            processed_dates.append(date_obj)

    if not processed_dates:
        if verbose:
            return jsonify({
                "message": "No valid dates found on page",
                "found_dates": found_dates,
                "processed_dates": processed_dates
            }), HTTP_BAD_REQUEST
        return json_response(request, "No valid dates found on page", HTTP_BAD_REQUEST)

    # Sort dates and return latest
    processed_dates.sort(key=lambda x: x["date"])
    latest = processed_dates[-1]

    response = {"latest": latest["date"], "type": latest["type"]}
    if verbose:
        response["dates"] = processed_dates

    return json_response(request, response, HTTP_OK)

165
tools.py
View File

@@ -1,18 +1,44 @@
from flask import Request, render_template, jsonify, make_response
import os
from functools import cache
import datetime
from typing import Optional, Dict, Union, Tuple
import re
from dateutil.parser import parse
# HTTP status codes
HTTP_OK = 200
HTTP_BAD_REQUEST = 400
HTTP_NOT_FOUND = 404
def getClientIP(request: Request) -> str:
    """
    Get the client's IP address from the request.

    The first entry of the X-Forwarded-For header is preferred; when the
    header is absent or its first entry is blank, the request's remote
    address is used instead.

    Args:
        request (Request): The Flask request object

    Returns:
        str: The client's IP address, or "unknown" if none is available
    """
    x_forwarded_for = request.headers.get("X-Forwarded-For")
    if x_forwarded_for:
        # Entries are comma separated and may carry surrounding whitespace.
        first_entry = x_forwarded_for.split(",")[0].strip()
        if first_entry:
            return first_entry

    ip = request.remote_addr
    return "unknown" if ip is None else ip
def getGitCommit():
def getGitCommit() -> str:
"""
Get the current git commit hash.
Returns:
str: The current git commit hash or a failure message
"""
# if .git exists, get the latest commit hash
if os.path.isdir(".git"):
git_dir = ".git"
@@ -35,77 +61,156 @@ def getGitCommit():
def isCurl(request: Request) -> bool:
    """
    Tell whether the request's User-Agent identifies curl or hurl.

    Args:
        request (Request): The Flask request object

    Returns:
        bool: True if the User-Agent contains "curl" or "hurl", False otherwise
    """
    headers = request.headers
    if not headers:
        return False

    agent = headers.get("User-Agent")
    if not agent:
        return False

    return any(tool in agent for tool in ("curl", "hurl"))
def isCrawler(request: Request) -> bool:
    """
    Tell whether the request's User-Agent identifies Googlebot or Bingbot.

    Args:
        request (Request): The Flask request object

    Returns:
        bool: True if the User-Agent contains "Googlebot" or "Bingbot",
        False otherwise
    """
    headers = request.headers
    if not headers:
        return False

    agent = headers.get("User-Agent")
    if not agent:
        return False

    return any(bot in agent for bot in ("Googlebot", "Bingbot"))
@cache
def getAddress(coin: str) -> str:
    """
    Look up the wallet address stored for a cryptocurrency.

    The address is read from the file named after the upper-cased coin code
    under .well-known/wallets/; results are memoised per coin argument.

    Args:
        coin (str): The cryptocurrency code

    Returns:
        str: The file's contents, or an empty string if no such file exists
    """
    wallet_file = ".well-known/wallets/" + coin.upper()
    if not os.path.isfile(wallet_file):
        return ""

    with open(wallet_file) as handle:
        return handle.read()
def getFilePath(name: str, path: str) -> Optional[str]:
    """
    Locate the first file called ``name`` beneath ``path``.

    Args:
        name (str): The filename to find
        path (str): The root directory to search

    Returns:
        Optional[str]: The joined path of the first match in walk order, or
        None if the tree holds no file with that name
    """
    candidates = (
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    )
    return next(candidates, None)
def json_response(request: Request, message: Union[str, Dict] = "404 Not Found", code: int = 404):
    """
    Create a JSON response with standard formatting.

    A dict message is sent as the response body with "status" and "ip"
    added (overriding any existing entries of those names); any other
    message is wrapped under a "message" key.

    Args:
        request (Request): The Flask request object
        message (Union[str, Dict]): The response message or data
        code (int): The HTTP status code

    Returns:
        A (response, status code) pair as built by jsonify
    """
    client_ip = getClientIP(request)
    if isinstance(message, dict):
        # Build a new dict so the caller's object is left untouched.
        payload = {**message, "status": code, "ip": client_ip}
    else:
        payload = {
            "status": code,
            "message": message,
            "ip": client_ip,
        }
    return jsonify(payload), code
def error_response(
    request: Request,
    message: str = "404 Not Found",
    code: int = 404,
    force_json: bool = False
) -> Union[Tuple[Dict, int], object]:
    """
    Build an error reply as JSON or as a rendered HTML page.

    Args:
        request (Request): The Flask request object
        message (str): The error message
        code (int): The HTTP status code
        force_json (bool): Whether to force JSON response regardless of client

    Returns:
        Union[Tuple[Dict, int], object]: The JSON response when forced or when
        the client is curl/hurl, otherwise the rendered HTML response
    """
    wants_json = force_json or isCurl(request)
    if wants_json:
        return json_response(request, message, code)

    # Prefer a template named after the status code; fall back to 404.html.
    if os.path.isfile(f"templates/{code}.html"):
        chosen_template = f"{code}.html"
    else:
        chosen_template = "404.html"

    page = render_template(chosen_template, code=code, message=message)
    response = make_response(page, code)

    # Expose the message to clients that only inspect headers.
    response.headers["X-Error-Message"] = message
    return response
def parse_date(date_groups: list[str]) -> str | None:
    """
    Parse a list of date components into YYYY-MM-DD format.

    The components are joined with spaces, ordinal suffixes (st, nd, rd, th)
    after digits are stripped, and the result is handed to dateutil's parser.
    Missing fields default to 1900-01-01, so a "Month YYYY" input yields the
    first day of that month.

    Args:
        date_groups (list[str]): Date components, e.g. ["12", "May", "2024"]

    Returns:
        str | None: The zero-padded ISO date, or None when the text cannot be
        parsed or resolves to the year 1900 (treated as "no year given")
    """
    try:
        # Join date groups into a single string
        date_str = " ".join(date_groups).strip()

        # Remove ordinal suffixes
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str, flags=re.IGNORECASE)

        # Parse with dateutil, default day=1 if missing
        dt = parse(date_str, default=datetime.datetime(1900, 1, 1))
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by the parser for out-of-range numbers.
        return None

    # If year is missing, parse will fallback to 1900 → reject
    if dt.year == 1900:
        return None

    # isoformat() zero-pads the year on every platform, unlike strftime("%Y").
    return dt.date().isoformat()