feat: Add status check speedups
All checks were successful
Build Docker / BuildImage (push) Successful in 59s

commit f936973b8d (parent bbc3801a41)
Date: 2025-06-13 23:43:41 +10:00
3 changed files with 473 additions and 231 deletions
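
For orientation, the speedup boils down to fanning the per-node checks out to a shared thread pool and collecting results as they complete. Below is a minimal, self-contained sketch of that pattern (illustrative only; the stub check_single_node stands in for the real plain DNS / DoH / DoT / certificate checks in server.py):

import concurrent.futures
import os

# Shared pool, sized the same way as the diff: at most 32 workers, or 4x CPU cores.
node_check_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=min(32, (os.cpu_count() or 1) * 4)
)

def check_single_node(ip: str) -> dict:
    # Stub standing in for the real DNS/DoH/DoT/certificate checks.
    return {"ip": ip, "ok": True}

def check_nodes(nodes: list) -> list:
    futures = {node_check_executor.submit(check_single_node, ip): ip for ip in nodes}
    node_status = []
    for future in concurrent.futures.as_completed(futures):
        ip = futures[future]
        try:
            node_status.append(future.result())
        except Exception as exc:
            # A failing check still yields an entry, mirroring the diff's behaviour.
            node_status.append({"ip": ip, "ok": False, "error": str(exc)})
    return node_status

if __name__ == "__main__":
    print(check_nodes(["192.0.2.1", "192.0.2.2", "192.0.2.3"]))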

main.py (12 lines changed)

@@ -1,13 +1,13 @@
 import time
 import signal
 import threading
+import concurrent.futures
 from flask import Flask
-from server import app
+from server import app, node_check_executor
 import server
 from gunicorn.app.base import BaseApplication
 import os
 import dotenv
-import concurrent.futures
 import schedule
@@ -59,6 +59,10 @@ def signal_handler(sig, frame):
print("Shutting down gracefully...", flush=True) print("Shutting down gracefully...", flush=True)
stop_event.set() stop_event.set()
# Shutdown the node check executor
print("Shutting down thread pools...", flush=True)
node_check_executor.shutdown(wait=False)
if __name__ == '__main__': if __name__ == '__main__':
dotenv.load_dotenv() dotenv.load_dotenv()
@@ -79,6 +83,10 @@ if __name__ == '__main__':
     finally:
         stop_event.set()
         scheduler_future.cancel()
+
+        # Make sure to shut down node check executor
+        node_check_executor.shutdown(wait=False)
+
         try:
             scheduler_future.result(timeout=5)
         except concurrent.futures.CancelledError:
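
Side note: executor.shutdown(wait=False) only makes the call itself return immediately; checks that are already running still finish before the interpreter exits. A small standalone illustration (the cancel_futures=True flag, Python 3.9+, is an extra not used in this diff; it additionally discards queued work):

import concurrent.futures
import time

executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)

def slow_check(ip: str) -> str:
    time.sleep(2)  # stand-in for a slow DNS/TLS check
    return ip

futures = [executor.submit(slow_check, f"192.0.2.{i}") for i in range(8)]

start = time.time()
# wait=False: shutdown() returns right away instead of blocking until all work is done.
# cancel_futures=True additionally discards checks that have not started yet.
executor.shutdown(wait=False, cancel_futures=True)
print(f"shutdown() returned after {time.time() - start:.3f}s", flush=True)

cancelled = sum(f.cancelled() for f in futures)
print(f"{cancelled} queued checks were cancelled; running ones finish in the background", flush=True)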

server.py (516 lines changed)

@@ -38,11 +38,22 @@ import functools
 import io
 import brotli
 from io import BytesIO
+import concurrent.futures
+from threading import Lock

 # Set up logging BEFORE attempting imports that might fail
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

+# Set up ThreadPoolExecutor for parallel node checking
+# Use a reasonable number of workers based on CPU cores
+node_check_executor = concurrent.futures.ThreadPoolExecutor(
+    max_workers=min(32, os.cpu_count() * 4)  # Max 32 workers or 4x CPU cores
+)
+
+# Create a lock for thread safety when updating cache
+cache_lock = Lock()
+
 dotenv.load_dotenv()

 # Configure caching
@@ -207,7 +218,9 @@ def retry(max_attempts=3, delay_seconds=1):
             while attempts < max_attempts:
                 try:
                     return func(*args, **kwargs)
-                except (socket.timeout, socket.error, dns.exception.Timeout, requests.exceptions.RequestException) as e:
+                except (socket.timeout, socket.error, dns.exception.Timeout,
+                        requests.exceptions.RequestException, ConnectionRefusedError,
+                        ConnectionResetError, OSError, ssl.SSLError) as e:
                     attempts += 1
                     last_error = e
                     logger.warning(f"Attempt {attempts} failed with error: {e} - retrying in {delay_seconds} seconds")
@@ -219,12 +232,13 @@ def retry(max_attempts=3, delay_seconds=1):
     return decorator

+# Optimize socket timeout settings
 @retry(max_attempts=3, delay_seconds=2)
 def check_plain_dns(ip: str) -> bool:
     resolver = dns.resolver.Resolver()
     resolver.nameservers = [ip]
-    resolver.timeout = 5  # Set a reasonable timeout
-    resolver.lifetime = 5  # Total timeout for the query
+    resolver.timeout = 3  # Reduced from 5 seconds to 3 seconds
+    resolver.lifetime = 3  # Reduced from 5 seconds to 3 seconds

     try:
         result = resolver.resolve("1.wdbrn", "TXT")
@@ -273,13 +287,13 @@ def check_doh(ip: str) -> dict:
     )
     wireframe_request = request.encode() + dns_query

-    # Create socket with timeout
-    sock = socket.create_connection((ip, 443), timeout=10)
+    # Create socket with reduced timeout
+    sock = socket.create_connection((ip, 443), timeout=5)  # Reduced from 10 to 5 seconds
     context = ssl.create_default_context()
     context.check_hostname = False  # Skip hostname verification for IP-based connection
     ssock = context.wrap_socket(sock, server_hostname="hnsdoh.com")
-    ssock.settimeout(10)  # Set a timeout for socket operations
+    ssock.settimeout(5)  # Reduced from 10 to 5 seconds
     ssock.sendall(wireframe_request)

     response_data = b""
@@ -354,7 +368,7 @@ def check_dot(ip: str) -> bool:
     q = dns.message.make_query(qname, dns.rdatatype.TXT)
     try:
         response = dns.query.tls(
-            q, ip, timeout=5, port=853, server_hostname="hnsdoh.com"
+            q, ip, timeout=3, port=853, server_hostname="hnsdoh.com"  # Reduced from 5 to 3 seconds
         )
         if response.rcode() == dns.rcode.NOERROR:
             for rrset in response.answer:
@@ -382,12 +396,12 @@ def verify_cert(ip: str, port: int) -> dict:
     ssock = None
     try:
-        sock = socket.create_connection((ip, port), timeout=10)
+        sock = socket.create_connection((ip, port), timeout=5)  # Reduced from 10 to 5 seconds

         # Wrap the socket in SSL/TLS
         context = ssl.create_default_context()
         context.check_hostname = False  # Skip hostname verification for IP-based connection
         ssock = context.wrap_socket(sock, server_hostname="hnsdoh.com")
-        ssock.settimeout(10)  # Set timeout for socket operations
+        ssock.settimeout(5)  # Reduced from 10 to 5 seconds

         # Retrieve the server's certificate
         cert = ssock.getpeercert()
@@ -469,7 +483,7 @@ def format_last_check(last_log: datetime) -> str:
 def check_nodes() -> list:
-    global nodes
+    global nodes, _node_status_cache, _node_status_cache_time
     if last_log > datetime.now() - relativedelta.relativedelta(minutes=1):
         # Load the last log
         with open(f"{log_dir}/node_status.json", "r") as file:
@@ -487,53 +501,43 @@ def check_nodes() -> list:
     if len(nodes) == 0:
         nodes = get_node_list()

+    # Use ThreadPoolExecutor to check nodes in parallel
+    futures = {}
     node_status = []
-    for ip in nodes:
-        logger.info(f"Checking node {ip}")
-        try:
-            plain_dns_result = check_plain_dns(ip)
-            doh_check = check_doh(ip)
-            dot_result = check_dot(ip)
-            cert_result = verify_cert(ip, 443)
-            cert_853_result = verify_cert(ip, 853)
-            node_status.append(
-                {
-                    "ip": ip,
-                    "name": node_names[ip] if ip in node_names else ip,
-                    "location": (
-                        node_locations[ip] if ip in node_locations else "Unknown"
-                    ),
-                    "plain_dns": plain_dns_result,
-                    "doh": doh_check["status"],
-                    "doh_server": doh_check["server"],
-                    "dot": dot_result,
-                    "cert": cert_result,
-                    "cert_853": cert_853_result,
-                }
-            )
-            logger.info(f"Node {ip} check complete")
-        except Exception as e:
-            logger.error(f"Error checking node {ip}: {e}")
-            # Add a failed entry for this node to ensure it's still included
-            node_status.append(
-                {
-                    "ip": ip,
-                    "name": node_names[ip] if ip in node_names else ip,
-                    "location": (
-                        node_locations[ip] if ip in node_locations else "Unknown"
-                    ),
-                    "plain_dns": False,
-                    "doh": False,
-                    "doh_server": [],
-                    "dot": False,
-                    "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
-                    "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
-                }
-            )
+
+    # Submit all node checks to the executor
+    for ip in nodes:
+        futures[node_check_executor.submit(check_single_node, ip)] = ip
+
+    # Collect results as they complete
+    for future in concurrent.futures.as_completed(futures):
+        ip = futures[future]
+        try:
+            node_result = future.result()
+            node_status.append(node_result)
+        except Exception as e:
+            logger.error(f"Error processing results for node {ip}: {e}")
+            # Ensure a failed node entry is still included
+            node_status.append({
+                "ip": ip,
+                "name": node_names[ip] if ip in node_names else ip,
+                "location": (node_locations[ip] if ip in node_locations else "Unknown"),
+                "plain_dns": False,
+                "doh": False,
+                "doh_server": [],
+                "dot": False,
+                "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+                "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            })

     # Save the node status to a file
     log_status(node_status)
+
+    # Update the in-memory cache with thread safety
+    with cache_lock:
+        _node_status_cache = node_status
+        _node_status_cache_time = datetime.now()
+
     logger.info("Finished checking nodes")

     # Send notifications if any nodes are down
@@ -567,156 +571,84 @@ def check_nodes() -> list:
     return node_status

-# Optimize check_nodes_from_log function with in-memory caching
-def check_nodes_from_log() -> list:
-    global last_log, _node_status_cache, _node_status_cache_time
-
-    # Check if we have a valid cache
-    current_time = datetime.now()
-    staleness_threshold_str = os.getenv("STALENESS_THRESHOLD_MINUTES", "15")
-    try:
-        staleness_threshold = int(staleness_threshold_str)
-    except ValueError:
-        logger.warning(f"Invalid STALENESS_THRESHOLD_MINUTES value: {staleness_threshold_str}")
-        staleness_threshold = 15
-
-    # Use in-memory cache if it's fresh enough
-    if (_node_status_cache is not None and _node_status_cache_time is not None and
-            current_time < _node_status_cache_time + relativedelta.relativedelta(minutes=staleness_threshold/2)):
-        logger.info(f"Using in-memory cache from {format_last_check(_node_status_cache_time)}")
-        return _node_status_cache
-
-    # Otherwise load from disk or run a new check
-    try:
-        with open(f"{log_dir}/node_status.json", "r") as file:
-            data = json.load(file)
-
-        newest = {
-            "date": datetime.now() - relativedelta.relativedelta(years=1),
-            "nodes": [],
-        }
-
-        for entry in data:
-            if datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S") > newest["date"]:
-                newest = entry
-                newest["date"] = datetime.strptime(newest["date"], "%Y-%m-%d %H:%M:%S")
-
-        node_status = newest["nodes"]
-        if current_time > newest["date"] + relativedelta.relativedelta(minutes=staleness_threshold):
-            logger.warning(f"Data is stale (older than {staleness_threshold} minutes), triggering immediate check")
-            node_status = check_nodes()
-        else:
-            last_log = newest["date"]
-            logger.info(f"Using cached node status from {format_last_check(last_log)}")
-
-        # Update the in-memory cache
-        _node_status_cache = node_status
-        _node_status_cache_time = current_time
-    except (FileNotFoundError, json.JSONDecodeError) as e:
-        logger.error(f"Error reading node status file: {e}")
-        logger.info("Running initial node check")
-        node_status = check_nodes()
-
-        # Update the in-memory cache
-        _node_status_cache = node_status
-        _node_status_cache_time = current_time
-
-    return node_status
-
-def send_notification(title, description, author):
-    discord_hook = os.getenv("DISCORD_HOOK")
-    if discord_hook:
-        data = {
-            "content": "",
-            "embeds": [
-                {
-                    "title": title,
-                    "description": description,
-                    "url": "https://status.hnsdoh.com",
-                    "color": 5814783,
-                    "author": {
-                        "name": author,
-                        "icon_url": "https://status.hnsdoh.com/favicon.png",
-                    },
-                }
-            ],
-            "username": "HNSDoH",
-            "avatar_url": "https://status.hnsdoh.com/favicon.png",
-            "attachments": [],
-        }
-        response = requests.post(discord_hook, json=data)
-        print("Sent notification", flush=True)
-    else:
-        print("No discord hook", flush=True)
-
-def send_down_notification(node):
-    global sent_notifications
-    # Check if a notification has already been sent
-    if node["ip"] not in sent_notifications:
-        sent_notifications[node["ip"]] = datetime.strftime(
-            datetime.now(), "%Y-%m-%d %H:%M:%S"
-        )
-    else:
-        last_send = datetime.strptime(
-            sent_notifications[node["ip"]], "%Y-%m-%d %H:%M:%S"
-        )
-        if last_send > datetime.now() - relativedelta.relativedelta(hours=1):
-            print(
-                f"Notification already sent for {node['name']} in the last hr",
-                flush=True,
-            )
-            return
-        # Only send certain notifications once per day
-        if node["plain_dns"] and node["doh"] and node["dot"]:
-            if last_send > datetime.now() - relativedelta.relativedelta(days=1):
-                print(
-                    f"Notification already sent for {node['name']} in the last day",
-                    flush=True,
-                )
-                return
-
-    # Save the notification to the file
-    sent_notifications[node["ip"]] = datetime.strftime(
-        datetime.now(), "%Y-%m-%d %H:%M:%S"
-    )
-    with open(f"{log_dir}/sent_notifications.json", "w") as file:
-        json.dump(sent_notifications, file, indent=4)
-
-    title = f"{node['name']} is down"
-    description = f"{node['name']} ({node['ip']}) is down with the following issues:\n"
-    if not node["plain_dns"]:
-        description += "- Plain DNS is down\n"
-    if not node["doh"]:
-        description += "- DoH is down\n"
-    if not node["dot"]:
-        description += "- DoT is down\n"
-    if not node["cert"]["valid"]:
-        description += "- Certificate on port 443 is invalid\n"
-    if not node["cert_853"]["valid"]:
-        description += "- Certificate on port 853 is invalid\n"
-
-    if node["plain_dns"] and node["doh"] and node["dot"]:
-        if node["cert"]["valid"] and node["cert_853"]["valid"]:
-            description = f"The certificate on {node['name']} ({node['ip']}) is expiring soon\n"
-            title = f"{node['name']} certificate is expiring soon"
-            # Also add the expiry date of the certificates
-            description += "\nCertificate expiry dates:\n"
-            description += f"- Certificate on port 443 expires {node['cert']['expires']}\n"
-            description += f"- Certificate on port 853 expires {node['cert_853']['expires']}\n"
-
-    send_notification(title, description, node["name"])
+def check_single_node(ip):
+    """Check a single node and return its status."""
+    logger.info(f"Checking node {ip}")
+    try:
+        # Add timeout handling for individual checks
+        plain_dns_result = False
+        doh_result = {"status": False, "server": []}
+        dot_result = False
+        cert_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+        cert_853_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+
+        # Use timeout to limit time spent on each check
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            future_plain_dns = executor.submit(check_plain_dns, ip)
+            future_doh = executor.submit(check_doh, ip)
+            future_dot = executor.submit(check_dot, ip)
+            future_cert = executor.submit(verify_cert, ip, 443)
+            future_cert_853 = executor.submit(verify_cert, ip, 853)
+
+            # Collect results with timeout
+            try:
+                plain_dns_result = future_plain_dns.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Plain DNS check timed out for {ip}: {str(e)}")
+            try:
+                doh_result = future_doh.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoH check timed out for {ip}: {str(e)}")
+            try:
+                dot_result = future_dot.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoT check timed out for {ip}: {str(e)}")
+            try:
+                cert_result = future_cert.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert check timed out for {ip}: {str(e)}")
+            try:
+                cert_853_result = future_cert_853.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert 853 check timed out for {ip}: {str(e)}")
+
+        node_status = {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": plain_dns_result,
+            "doh": doh_result["status"],
+            "doh_server": doh_result["server"],
+            "dot": dot_result,
+            "cert": cert_result,
+            "cert_853": cert_853_result,
+        }
+        logger.info(f"Node {ip} check complete")
+        return node_status
+    except Exception as e:
+        logger.error(f"Error checking node {ip}: {e}")
+        # Add a failed entry for this node to ensure it's still included
+        return {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": False,
+            "doh": False,
+            "doh_server": [],
+            "dot": False,
+            "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+        }
 # endregion

 # region File logs
@@ -786,9 +718,9 @@ def create_default_node_dict():
"name": "", "name": "",
"location": "", "location": "",
"ip": "", "ip": "",
"plain_dns": {"last_down": "Never", "percentage": 0}, "plain_dns": {"last_down": "never", "percentage": 0},
"doh": {"last_down": "Never", "percentage": 0}, "doh": {"last_down": "never", "percentage": 0},
"dot": {"last_down": "Never", "percentage": 0}, "dot": {"last_down": "never", "percentage": 0},
} }
def create_default_counts_dict(): def create_default_counts_dict():
@@ -804,9 +736,9 @@ def summarize_history(history: list) -> dict:
     nodes_status = defaultdict(create_default_node_dict)
     overall_status = {
-        "plain_dns": {"last_down": "Never", "percentage": 0},
-        "doh": {"last_down": "Never", "percentage": 0},
-        "dot": {"last_down": "Never", "percentage": 0},
+        "plain_dns": {"last_down": "never", "percentage": 0},
+        "doh": {"last_down": "never", "percentage": 0},
+        "dot": {"last_down": "never", "percentage": 0},
     }

     # Collect data
@@ -834,7 +766,7 @@ def summarize_history(history: list) -> dict:
for key in ["plain_dns", "doh", "dot"]: for key in ["plain_dns", "doh", "dot"]:
if node.get(key) == False: if node.get(key) == False:
# Check if the last downtime is more recent # Check if the last downtime is more recent
if nodes_status[ip][key]["last_down"] == "Never": if nodes_status[ip][key]["last_down"] == "never":
nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S") nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S")
elif date > datetime.strptime(nodes_status[ip][key]["last_down"], "%Y-%m-%d %H:%M:%S"): elif date > datetime.strptime(nodes_status[ip][key]["last_down"], "%Y-%m-%d %H:%M:%S"):
nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S") nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S")
@@ -875,7 +807,7 @@ def summarize_history(history: list) -> dict:
         last_downs = [
             nodes_status[ip][key]["last_down"]
             for ip in nodes_status
-            if nodes_status[ip][key]["last_down"] != "Never"
+            if nodes_status[ip][key]["last_down"] != "never"
         ]
         if last_downs:
             overall_status[key]["last_down"] = max(last_downs)
@@ -961,7 +893,7 @@ def api_index():
 # Cache node status for API requests
 @app.route("/api/nodes")
-@cache.cached(timeout=60)  # Cache for 1 minute
+@cache.cached(timeout=300)  # Increased from 60s to 5 minutes
 def api_nodes():
     node_status = check_nodes_from_log()
     return jsonify(node_status)
@@ -1118,6 +1050,14 @@ def api_errors():
@app.route("/api/check/<ip>") @app.route("/api/check/<ip>")
@cache.cached(timeout=30) # Cache for 30 seconds @cache.cached(timeout=30) # Cache for 30 seconds
def api_check(ip: str): def api_check(ip: str):
# Verify IP is one of the nodes
global nodes
if not nodes:
return jsonify({"error": "No nodes available"}), 404
if ip not in nodes:
return jsonify({"error": f"Node {ip} not found"}), 404
logger.info(f"Checking node {ip}") logger.info(f"Checking node {ip}")
data = { data = {
"ip": ip, "ip": ip,
@@ -1150,8 +1090,19 @@ def api_check(ip: str):
 # region Main routes
 # Cache the main page rendering
 @app.route("/")
-@cache.cached(timeout=60, query_string=True)  # Cache for 1 minute, respect query params
+@cache.cached(timeout=120, query_string=True)  # Increased from 60s to 2 minutes
 def index():
+    # Check for fast_load parameter to provide a quicker initial page load
+    fast_load = request.args.get('fast_load', 'false').lower() == 'true'
+
+    if fast_load:
+        # Return a minimal template that will load data via JavaScript
+        return render_template(
+            "index_fast.html",
+            api_url=request.url_root + "api"
+        )
+
+    # Original slower but complete load
     node_status = check_nodes_from_log()
     alerts = []
@@ -1218,7 +1169,7 @@ def index():
     # Convert time to relative time
     for node in history_summary["nodes"]:
         for key in ["plain_dns", "doh", "dot"]:
-            if node[key]["last_down"] == "Never":
+            if node[key]["last_down"] == "never":
                 node[key]["last_down"] = "over 30 days ago"
             else:
                 node[key]["last_down"] = format_last_check(
@@ -1226,7 +1177,7 @@ def index():
                 )
     for key in ["plain_dns", "doh", "dot"]:
-        if history_summary["overall"][key]["last_down"] == "Never":
+        if history_summary["overall"][key]["last_down"] == "never":
             continue
         history_summary["overall"][key]["last_down"] = format_last_check(
             datetime.strptime(history_summary["overall"][key]["last_down"], "%Y-%m-%d %H:%M:%S")
@@ -1307,16 +1258,12 @@ def scheduled_node_check():
         global nodes, _node_status_cache, _node_status_cache_time
         nodes = []  # Reset node list to force refresh

-        # Run the check and update in-memory cache
+        # Run the check (which now uses ThreadPoolExecutor)
         node_status = check_nodes()
-        _node_status_cache = node_status
-        _node_status_cache_time = datetime.now()

         # Clear relevant caches
         cache.delete_memoized(api_nodes)
         cache.delete_memoized(api_errors)
         cache.delete_memoized(index)

         logger.info("Completed scheduled node check and updated caches")
     except Exception as e:
         logger.error(f"Error in scheduled node check: {e}")
@@ -1339,7 +1286,6 @@ def start_scheduler():
         check_interval = 5

     logger.info(f"Setting up scheduler to run every {check_interval} minutes")

     # Add the job to the scheduler
     scheduler.add_job(
         scheduled_node_check,
@@ -1347,10 +1293,9 @@ def start_scheduler():
         id='node_check_job',
         replace_existing=True
     )
-    logger.info(f"Setting up scheduler to run every {check_interval} minutes")

     # Add listener for job events
     scheduler.add_listener(scheduler_listener, EVENT_JOB_ERROR | EVENT_JOB_EXECUTED)

     # Start the scheduler if it's not already running
     if not scheduler.running:
         scheduler.start()
@@ -1364,10 +1309,6 @@ def signal_handler(sig, frame):
logger.info("Scheduler shut down") logger.info("Scheduler shut down")
sys.exit(0) sys.exit(0)
# Register the signal handlers for Docker
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
# Initialize the scheduler when the app starts without relying on @before_first_request # Initialize the scheduler when the app starts without relying on @before_first_request
# which is deprecated in newer Flask versions # which is deprecated in newer Flask versions
with app.app_context(): with app.app_context():
@@ -1388,7 +1329,6 @@ def add_compression(response):
             'Content-Encoding' in response.headers or
             response.direct_passthrough):
         return response

     # Only compress specific MIME types
     content_type = response.headers.get('Content-Type', '')
     compressible_types = [
@@ -1400,20 +1340,15 @@ def add_compression(response):
         'application/xml',
         'text/xml'
     ]

     if not any(t in content_type for t in compressible_types):
         return response

     accept_encoding = request.headers.get('Accept-Encoding', '')

     if 'br' in accept_encoding:
         try:
             # Get the response content
             response_data = response.get_data()
             # Compress with Brotli
             compressed_data = brotli.compress(response_data, quality=6)
             # Only apply Brotli if it results in smaller size
             if len(compressed_data) < len(response_data):
                 response.set_data(compressed_data)
@@ -1425,7 +1360,160 @@ def add_compression(response):
     return response

+def check_nodes_from_log():
+    """Read the most recent node status from the log file."""
+    global _node_status_cache, _node_status_cache_time
+
+    # Return cached result if it's less than 2 minutes old (increased from 60s)
+    with cache_lock:
+        if _node_status_cache is not None and _node_status_cache_time is not None:
+            if (datetime.now() - _node_status_cache_time).total_seconds() < 120:
+                logger.debug("Using cached node status")
+                return _node_status_cache
+
+    try:
+        # Load the last log
+        with open(f"{log_dir}/node_status.json", "r") as file:
+            data = json.load(file)
+
+        newest = {
+            "date": datetime.now() - relativedelta.relativedelta(years=1),
+            "nodes": [],
+        }
+        for entry in data:
+            entry_date = datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S")
+            if entry_date > newest["date"]:
+                newest = entry
+                newest["date"] = entry_date
+
+        # Update the cache
+        with cache_lock:
+            _node_status_cache = newest["nodes"]
+            _node_status_cache_time = datetime.now()
+
+        return newest["nodes"]
+    except Exception as e:
+        logger.error(f"Error reading node status from log: {e}")
+        # If we can't read from the log, run a fresh check
+        return check_nodes()
+
+# Add a lightweight status function for quick status checks
+@app.route("/api/quick-status")
+@cache.cached(timeout=30)  # Cache for 30 seconds
+def quick_status():
+    """Return a minimal status without expensive node checks"""
+    try:
+        # Load the last log
+        with open(f"{log_dir}/node_status.json", "r") as file:
+            data = json.load(file)
+
+        if not data:
+            return jsonify({"status": "unknown", "last_check": "never"})
+
+        newest_entry = max(data, key=lambda x: datetime.strptime(x["date"], "%Y-%m-%d %H:%M:%S"))
+        last_check_time = format_last_check(datetime.strptime(newest_entry["date"], "%Y-%m-%d %H:%M:%S"))
+
+        # Count nodes with issues
+        node_status = newest_entry["nodes"]
+        total_nodes = len(node_status)
+        nodes_with_issues = 0
+        for node in node_status:
+            if (not node["plain_dns"] or not node["doh"] or not node["dot"] or
+                    not node["cert"]["valid"] or not node["cert_853"]["valid"]):
+                nodes_with_issues += 1
+
+        return jsonify({
+            "status": "ok" if nodes_with_issues == 0 else "issues",
+            "last_check": last_check_time,
+            "total_nodes": total_nodes,
+            "nodes_with_issues": nodes_with_issues
+        })
+    except Exception as e:
+        logger.error(f"Error getting quick status: {e}")
+        return jsonify({"status": "error", "message": str(e)})
+
+# Optimize check_single_node with shorter timeouts
+def check_single_node(ip):
+    """Check a single node and return its status."""
+    logger.info(f"Checking node {ip}")
+    try:
+        # Add timeout handling for individual checks
+        plain_dns_result = False
+        doh_result = {"status": False, "server": []}
+        dot_result = False
+        cert_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+        cert_853_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+
+        # Use timeout to limit time spent on each check
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            future_plain_dns = executor.submit(check_plain_dns, ip)
+            future_doh = executor.submit(check_doh, ip)
+            future_dot = executor.submit(check_dot, ip)
+            future_cert = executor.submit(verify_cert, ip, 443)
+            future_cert_853 = executor.submit(verify_cert, ip, 853)
+
+            # Collect results with timeout
+            try:
+                plain_dns_result = future_plain_dns.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Plain DNS check timed out for {ip}: {str(e)}")
+            try:
+                doh_result = future_doh.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoH check timed out for {ip}: {str(e)}")
+            try:
+                dot_result = future_dot.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoT check timed out for {ip}: {str(e)}")
+            try:
+                cert_result = future_cert.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert check timed out for {ip}: {str(e)}")
+            try:
+                cert_853_result = future_cert_853.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert 853 check timed out for {ip}: {str(e)}")
+
+        node_status = {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": plain_dns_result,
+            "doh": doh_result["status"],
+            "doh_server": doh_result["server"],
+            "dot": dot_result,
+            "cert": cert_result,
+            "cert_853": cert_853_result,
+        }
+        logger.info(f"Node {ip} check complete")
+        return node_status
+    except Exception as e:
+        logger.error(f"Error checking node {ip}: {e}")
+        # Add a failed entry for this node to ensure it's still included
+        return {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": False,
+            "doh": False,
+            "doh_server": [],
+            "dot": False,
+            "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+        }
+
+# Run the app with threading enabled
 if __name__ == "__main__":
     # The scheduler is already started in the app context above
-    # Run the Flask app
-    app.run(debug=True, port=5000, host="0.0.0.0")
+    # Run the Flask app with threading for better concurrency
+    app.run(debug=True, port=5000, host="0.0.0.0", threaded=True)
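
For reference, the new /api/quick-status route added above can be polled with a few lines of client code (a sketch only; the base URL is a placeholder and the field names follow the handler in this diff):

import requests

# Placeholder base URL; point it at wherever this service is deployed.
BASE_URL = "http://localhost:5000"

resp = requests.get(f"{BASE_URL}/api/quick-status", timeout=10)
resp.raise_for_status()
status = resp.json()

# Fields as emitted by quick_status(): status, last_check, and (when a log
# exists) total_nodes / nodes_with_issues.
if status.get("status") == "ok":
    print(f"All {status['total_nodes']} nodes healthy (last check {status['last_check']})")
elif status.get("status") == "issues":
    print(f"{status['nodes_with_issues']} of {status['total_nodes']} nodes have issues")
else:
    print(f"Status unknown or error: {status}")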

templates/index_fast.html (new file, 146 lines)

@@ -0,0 +1,146 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>HNSDoH Status</title>
<link rel="stylesheet" href="/assets/css/style.css">
<meta name="description" content="HNSDoH Status page - Monitoring the status of HNSDoH resolvers">
<link rel="manifest" href="/manifest.json">
<link rel="icon" type="image/png" href="/favicon.png">
<style>
.loader {
border: 5px solid #f3f3f3;
border-radius: 50%;
border-top: 5px solid #3498db;
width: 40px;
height: 40px;
margin: 20px auto;
animation: spin 1.5s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.quick-status {
text-align: center;
padding: 20px;
margin: 20px;
border-radius: 5px;
background-color: #f5f5f5;
}
.status-ok {
color: green;
}
.status-issues {
color: orange;
}
.status-error {
color: red;
}
</style>
</head>
<body>
<header>
<h1>HNSDoH Status</h1>
<p>Monitoring the status of HNSDoH resolvers</p>
</header>
<div class="quick-status">
<h2>Current Status</h2>
<div id="quick-status-display">Loading...</div>
<div class="loader" id="status-loader"></div>
</div>
<main>
<div id="content">
<div class="loader"></div>
<p>Loading full status data...</p>
<p>This may take a few moments as we check all resolver nodes.</p>
</div>
</main>
<footer>
<p>Made by <a href="https://nathan.woodburn.au">Nathan.Woodburn/</a></p>
</footer>
<script>
// Load quick status first
fetch('/api/quick-status')
.then(response => response.json())
.then(data => {
const statusDisplay = document.getElementById('quick-status-display');
const statusLoader = document.getElementById('status-loader');
let statusClass = 'status-ok';
let statusMessage = 'All systems operational';
if (data.status === 'issues') {
statusClass = 'status-issues';
statusMessage = `${data.nodes_with_issues} out of ${data.total_nodes} nodes have issues`;
} else if (data.status === 'error' || data.status === 'unknown') {
statusClass = 'status-error';
statusMessage = 'Unable to determine system status';
}
statusDisplay.innerHTML = `
<h3 class="${statusClass}">${statusMessage}</h3>
<p>Last check: ${data.last_check}</p>
`;
statusLoader.style.display = 'none';
})
.catch(error => {
document.getElementById('quick-status-display').innerHTML = `
<h3 class="status-error">Error loading status</h3>
<p>${error}</p>
`;
document.getElementById('status-loader').style.display = 'none';
});
// Then load full page data
fetch('/api/nodes')
.then(response => response.json())
.then(nodeData => {
// Once we have node data, get history data
return Promise.all([
Promise.resolve(nodeData),
fetch('/api/history').then(res => res.json())
]);
})
.then(([nodeData, historyData]) => {
// Now we have both datasets, fetch the HTML with them
return fetch('/?' + new URLSearchParams({
_data_loaded: 'true' // Signal to the server we already have data
}));
})
.then(response => response.text())
.then(html => {
document.getElementById('content').innerHTML = html;
// Replace direct links with JS-enhanced versions
document.querySelectorAll('a').forEach(link => {
const href = link.getAttribute('href');
if (href && href.startsWith('/') && !href.includes('fast_load')) {
link.setAttribute('href', href + (href.includes('?') ? '&' : '?') + 'fast_load=true');
}
});
})
.catch(error => {
document.getElementById('content').innerHTML = `
<div class="error">
<h2>Error Loading Data</h2>
<p>There was a problem loading the full status data. Please try refreshing the page.</p>
<p>Error details: ${error}</p>
<a href="/" class="button">Refresh Page</a>
</div>
`;
});
</script>
</body>
</html>