feat: Add status check speedups
All checks were successful
Build Docker / BuildImage (push) Successful in 59s
main.py (12 lines changed)
@@ -1,13 +1,13 @@
 import time
 import signal
 import threading
 import concurrent.futures
 from flask import Flask
-from server import app
+from server import app, node_check_executor
 import server
 from gunicorn.app.base import BaseApplication
 import os
 import dotenv
 import concurrent.futures
 import schedule
@@ -58,6 +58,10 @@ def run_gunicorn():
 def signal_handler(sig, frame):
     print("Shutting down gracefully...", flush=True)
     stop_event.set()
+
+    # Shutdown the node check executor
+    print("Shutting down thread pools...", flush=True)
+    node_check_executor.shutdown(wait=False)
 
 if __name__ == '__main__':
@@ -79,6 +83,10 @@ if __name__ == '__main__':
     finally:
         stop_event.set()
         scheduler_future.cancel()
+
+        # Make sure to shut down node check executor
+        node_check_executor.shutdown(wait=False)
+
         try:
             scheduler_future.result(timeout=5)
         except concurrent.futures.CancelledError:
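Note on the shutdown changes above: the handler sets the stop event before calling shutdown(wait=False), so in-flight checks can observe the event while the pool stops accepting new work. A minimal, self-contained sketch of the same ordering (the worker function and names here are illustrative, not from this repo):

import concurrent.futures
import signal
import threading
import time

stop_event = threading.Event()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)

def worker(n):
    # Poll the stop event so an in-flight task can exit promptly.
    while not stop_event.is_set():
        time.sleep(0.1)
    return n

def handle_signal(sig, frame):
    stop_event.set()            # signal running tasks first
    pool.shutdown(wait=False)   # then stop accepting new work

signal.signal(signal.SIGINT, handle_signal)
signal.signal(signal.SIGTERM, handle_signal)

if __name__ == "__main__":
    futures = [pool.submit(worker, i) for i in range(4)]
    for f in concurrent.futures.as_completed(futures):
        print(f.result())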
server.py (546 lines changed)
@@ -38,11 +38,22 @@ import functools
 import io
 import brotli
 from io import BytesIO
+import concurrent.futures
+from threading import Lock
 
 # Set up logging BEFORE attempting imports that might fail
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# Set up ThreadPoolExecutor for parallel node checking
+# Use a reasonable number of workers based on CPU cores
+node_check_executor = concurrent.futures.ThreadPoolExecutor(
+    max_workers=min(32, os.cpu_count() * 4)  # Max 32 workers or 4x CPU cores
+)
+
+# Create a lock for thread safety when updating cache
+cache_lock = Lock()
+
 dotenv.load_dotenv()
 
 # Configure caching
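One caveat with the executor sizing above: os.cpu_count() is documented to return None when the core count cannot be determined, in which case os.cpu_count() * 4 raises a TypeError at import time. A defensive variant (a sketch, not what the commit does) guards the arithmetic:

import concurrent.futures
import os

# os.cpu_count() may return None; fall back to 1 so min() always
# receives an integer and module import cannot fail.
cpu_count = os.cpu_count() or 1
node_check_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=min(32, cpu_count * 4)  # cap at 32 workers or 4x CPU cores
)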
@@ -207,7 +218,9 @@ def retry(max_attempts=3, delay_seconds=1):
             while attempts < max_attempts:
                 try:
                     return func(*args, **kwargs)
-                except (socket.timeout, socket.error, dns.exception.Timeout, requests.exceptions.RequestException) as e:
+                except (socket.timeout, socket.error, dns.exception.Timeout,
+                        requests.exceptions.RequestException, ConnectionRefusedError,
+                        ConnectionResetError, OSError, ssl.SSLError) as e:
                     attempts += 1
                     last_error = e
                     logger.warning(f"Attempt {attempts} failed with error: {e} - retrying in {delay_seconds} seconds")
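For reference, a decorator of the shape used above can be written self-contained with only the standard library; this sketch keeps the fixed delay and broad exception tuple but omits the dnspython and requests exception types so it runs without third-party imports:

import functools
import logging
import socket
import ssl
import time

logger = logging.getLogger(__name__)

def retry(max_attempts=3, delay_seconds=1):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            attempts = 0
            last_error = None
            while attempts < max_attempts:
                try:
                    return func(*args, **kwargs)
                except (socket.timeout, ConnectionRefusedError,
                        ConnectionResetError, ssl.SSLError, OSError) as e:
                    attempts += 1
                    last_error = e
                    logger.warning("Attempt %d failed: %s - retrying in %ss",
                                   attempts, e, delay_seconds)
                    time.sleep(delay_seconds)
            raise last_error
        return wrapper
    return decorator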
@@ -219,12 +232,13 @@ def retry(max_attempts=3, delay_seconds=1):
     return decorator
 
 
+# Optimize socket timeout settings
 @retry(max_attempts=3, delay_seconds=2)
 def check_plain_dns(ip: str) -> bool:
     resolver = dns.resolver.Resolver()
     resolver.nameservers = [ip]
-    resolver.timeout = 5  # Set a reasonable timeout
-    resolver.lifetime = 5  # Total timeout for the query
+    resolver.timeout = 3  # Reduced from 5 seconds to 3 seconds
+    resolver.lifetime = 3  # Reduced from 5 seconds to 3 seconds
 
     try:
         result = resolver.resolve("1.wdbrn", "TXT")
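The two resolver settings changed above do different jobs in dnspython: timeout is how long to wait on each individual nameserver, while lifetime caps the whole query including retries. A standalone sketch (the resolver IP and query name here are placeholders):

import dns.exception
import dns.resolver  # dnspython

resolver = dns.resolver.Resolver()
resolver.nameservers = ["9.9.9.9"]  # placeholder resolver IP
resolver.timeout = 3   # seconds to wait on each individual nameserver
resolver.lifetime = 3  # total seconds allowed for the whole query

try:
    answer = resolver.resolve("example.com", "TXT")
    print([rdata.to_text() for rdata in answer])
except dns.exception.Timeout:
    print("query exceeded the 3 second lifetime")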
@@ -273,13 +287,13 @@ def check_doh(ip: str) -> dict:
     )
     wireframe_request = request.encode() + dns_query
 
-    # Create socket with timeout
-    sock = socket.create_connection((ip, 443), timeout=10)
+    # Create socket with reduced timeout
+    sock = socket.create_connection((ip, 443), timeout=5)  # Reduced from 10 to 5 seconds
     context = ssl.create_default_context()
     context.check_hostname = False  # Skip hostname verification for IP-based connection
     ssock = context.wrap_socket(sock, server_hostname="hnsdoh.com")
 
-    ssock.settimeout(10)  # Set a timeout for socket operations
+    ssock.settimeout(5)  # Reduced from 10 to 5 seconds
     ssock.sendall(wireframe_request)
 
     response_data = b""
@@ -354,7 +368,7 @@ def check_dot(ip: str) -> bool:
     q = dns.message.make_query(qname, dns.rdatatype.TXT)
     try:
         response = dns.query.tls(
-            q, ip, timeout=5, port=853, server_hostname="hnsdoh.com"
+            q, ip, timeout=3, port=853, server_hostname="hnsdoh.com"  # Reduced from 5 to 3 seconds
         )
         if response.rcode() == dns.rcode.NOERROR:
             for rrset in response.answer:
@@ -382,12 +396,12 @@ def verify_cert(ip: str, port: int) -> dict:
     ssock = None
 
     try:
-        sock = socket.create_connection((ip, port), timeout=10)
+        sock = socket.create_connection((ip, port), timeout=5)  # Reduced from 10 to 5 seconds
         # Wrap the socket in SSL/TLS
         context = ssl.create_default_context()
         context.check_hostname = False  # Skip hostname verification for IP-based connection
         ssock = context.wrap_socket(sock, server_hostname="hnsdoh.com")
-        ssock.settimeout(10)  # Set timeout for socket operations
+        ssock.settimeout(5)  # Reduced from 10 to 5 seconds
 
         # Retrieve the server's certificate
         cert = ssock.getpeercert()
@@ -469,7 +483,7 @@ def format_last_check(last_log: datetime) -> str:
 
 
 def check_nodes() -> list:
-    global nodes
+    global nodes, _node_status_cache, _node_status_cache_time
     if last_log > datetime.now() - relativedelta.relativedelta(minutes=1):
         # Load the last log
         with open(f"{log_dir}/node_status.json", "r") as file:
@@ -487,53 +501,43 @@ def check_nodes() -> list:
     if len(nodes) == 0:
         nodes = get_node_list()
 
+    # Use ThreadPoolExecutor to check nodes in parallel
+    futures = {}
     node_status = []
 
+    # Submit all node checks to the executor
     for ip in nodes:
-        logger.info(f"Checking node {ip}")
+        futures[node_check_executor.submit(check_single_node, ip)] = ip
 
+    # Collect results as they complete
+    for future in concurrent.futures.as_completed(futures):
+        ip = futures[future]
         try:
-            plain_dns_result = check_plain_dns(ip)
-            doh_check = check_doh(ip)
-            dot_result = check_dot(ip)
-            cert_result = verify_cert(ip, 443)
-            cert_853_result = verify_cert(ip, 853)
-
-            node_status.append(
-                {
-                    "ip": ip,
-                    "name": node_names[ip] if ip in node_names else ip,
-                    "location": (
-                        node_locations[ip] if ip in node_locations else "Unknown"
-                    ),
-                    "plain_dns": plain_dns_result,
-                    "doh": doh_check["status"],
-                    "doh_server": doh_check["server"],
-                    "dot": dot_result,
-                    "cert": cert_result,
-                    "cert_853": cert_853_result,
-                }
-            )
-            logger.info(f"Node {ip} check complete")
+            node_result = future.result()
+            node_status.append(node_result)
         except Exception as e:
-            logger.error(f"Error checking node {ip}: {e}")
-            # Add a failed entry for this node to ensure it's still included
-            node_status.append(
-                {
-                    "ip": ip,
-                    "name": node_names[ip] if ip in node_names else ip,
-                    "location": (
-                        node_locations[ip] if ip in node_locations else "Unknown"
-                    ),
-                    "plain_dns": False,
-                    "doh": False,
-                    "doh_server": [],
-                    "dot": False,
-                    "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
-                    "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
-                }
-            )
+            logger.error(f"Error processing results for node {ip}: {e}")
+            # Ensure a failed node entry is still included
+            node_status.append({
+                "ip": ip,
+                "name": node_names[ip] if ip in node_names else ip,
+                "location": (node_locations[ip] if ip in node_locations else "Unknown"),
+                "plain_dns": False,
+                "doh": False,
+                "doh_server": [],
+                "dot": False,
+                "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+                "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            })
 
     # Save the node status to a file
     log_status(node_status)
 
+    # Update the in-memory cache with thread safety
+    with cache_lock:
+        _node_status_cache = node_status
+        _node_status_cache_time = datetime.now()
+
    logger.info("Finished checking nodes")
 
     # Send notifications if any nodes are down
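The future-to-IP dictionary above is the standard way to attribute results (and failures) back to their inputs when completion order is arbitrary. The same pattern in isolation, with a stub check and placeholder TEST-NET addresses:

import concurrent.futures

def check_single_node(ip):
    # Stub standing in for the real per-node checks.
    return {"ip": ip, "ok": True}

nodes = ["192.0.2.1", "192.0.2.2"]  # placeholder IPs (TEST-NET-1)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(check_single_node, ip): ip for ip in nodes}
    results = []
    for future in concurrent.futures.as_completed(futures):
        ip = futures[future]  # map the finished future back to its input
        try:
            results.append(future.result())
        except Exception as e:
            results.append({"ip": ip, "ok": False, "error": str(e)})

print(results)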
@@ -567,156 +571,84 @@ def check_nodes() -> list:
     return node_status
 
 
-# Optimize check_nodes_from_log function with in-memory caching
-def check_nodes_from_log() -> list:
-    global last_log, _node_status_cache, _node_status_cache_time
-
-    # Check if we have a valid cache
-    current_time = datetime.now()
-    staleness_threshold_str = os.getenv("STALENESS_THRESHOLD_MINUTES", "15")
-    try:
-        staleness_threshold = int(staleness_threshold_str)
-    except ValueError:
-        logger.warning(f"Invalid STALENESS_THRESHOLD_MINUTES value: {staleness_threshold_str}")
-        staleness_threshold = 15
-
-    # Use in-memory cache if it's fresh enough
-    if (_node_status_cache is not None and _node_status_cache_time is not None and
-            current_time < _node_status_cache_time + relativedelta.relativedelta(minutes=staleness_threshold/2)):
-        logger.info(f"Using in-memory cache from {format_last_check(_node_status_cache_time)}")
-        return _node_status_cache
-
-    # Otherwise load from disk or run a new check
-    try:
-        with open(f"{log_dir}/node_status.json", "r") as file:
-            data = json.load(file)
-
-        newest = {
-            "date": datetime.now() - relativedelta.relativedelta(years=1),
-            "nodes": [],
-        }
-
-        for entry in data:
-            if datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S") > newest["date"]:
-                newest = entry
-                newest["date"] = datetime.strptime(newest["date"], "%Y-%m-%d %H:%M:%S")
-
-        node_status = newest["nodes"]
-
-        if current_time > newest["date"] + relativedelta.relativedelta(minutes=staleness_threshold):
-            logger.warning(f"Data is stale (older than {staleness_threshold} minutes), triggering immediate check")
-            node_status = check_nodes()
-        else:
-            last_log = newest["date"]
-            logger.info(f"Using cached node status from {format_last_check(last_log)}")
-
-            # Update the in-memory cache
-            _node_status_cache = node_status
-            _node_status_cache_time = current_time
-
-    except (FileNotFoundError, json.JSONDecodeError) as e:
-        logger.error(f"Error reading node status file: {e}")
-        logger.info("Running initial node check")
-        node_status = check_nodes()
-
-        # Update the in-memory cache
-        _node_status_cache = node_status
-        _node_status_cache_time = current_time
-
-    return node_status
+def check_single_node(ip):
+    """Check a single node and return its status."""
+    logger.info(f"Checking node {ip}")
+    try:
+        # Add timeout handling for individual checks
+        plain_dns_result = False
+        doh_result = {"status": False, "server": []}
+        dot_result = False
+        cert_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+        cert_853_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+
+        # Use timeout to limit time spent on each check
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            future_plain_dns = executor.submit(check_plain_dns, ip)
+            future_doh = executor.submit(check_doh, ip)
+            future_dot = executor.submit(check_dot, ip)
+            future_cert = executor.submit(verify_cert, ip, 443)
+            future_cert_853 = executor.submit(verify_cert, ip, 853)
+
+            # Collect results with timeout
+            try:
+                plain_dns_result = future_plain_dns.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Plain DNS check timed out for {ip}: {str(e)}")
+
+            try:
+                doh_result = future_doh.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoH check timed out for {ip}: {str(e)}")
+
+            try:
+                dot_result = future_dot.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoT check timed out for {ip}: {str(e)}")
+
+            try:
+                cert_result = future_cert.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert check timed out for {ip}: {str(e)}")
+
+            try:
+                cert_853_result = future_cert_853.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert 853 check timed out for {ip}: {str(e)}")
+
+        node_status = {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": plain_dns_result,
+            "doh": doh_result["status"],
+            "doh_server": doh_result["server"],
+            "dot": dot_result,
+            "cert": cert_result,
+            "cert_853": cert_853_result,
+        }
+        logger.info(f"Node {ip} check complete")
+        return node_status
+    except Exception as e:
+        logger.error(f"Error checking node {ip}: {e}")
+        # Add a failed entry for this node to ensure it's still included
+        return {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": False,
+            "doh": False,
+            "doh_server": [],
+            "dot": False,
+            "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+        }
 
 
 def send_notification(title, description, author):
     discord_hook = os.getenv("DISCORD_HOOK")
     if discord_hook:
         data = {
             "content": "",
             "embeds": [
                 {
                     "title": title,
                     "description": description,
                     "url": "https://status.hnsdoh.com",
                     "color": 5814783,
                     "author": {
                         "name": author,
                         "icon_url": "https://status.hnsdoh.com/favicon.png",
                     },
                 }
             ],
             "username": "HNSDoH",
             "avatar_url": "https://status.hnsdoh.com/favicon.png",
             "attachments": [],
         }
         response = requests.post(discord_hook, json=data)
         print("Sent notification", flush=True)
     else:
         print("No discord hook", flush=True)
 
 
 def send_down_notification(node):
     global sent_notifications
 
     # Check if a notification has already been sent
     if node["ip"] not in sent_notifications:
         sent_notifications[node["ip"]] = datetime.strftime(
             datetime.now(), "%Y-%m-%d %H:%M:%S"
         )
     else:
         last_send = datetime.strptime(
             sent_notifications[node["ip"]], "%Y-%m-%d %H:%M:%S"
         )
 
         if last_send > datetime.now() - relativedelta.relativedelta(hours=1):
             print(
                 f"Notification already sent for {node['name']} in the last hr",
                 flush=True,
             )
             return
 
         # Only send certain notifications once per day
         if node["plain_dns"] and node["doh"] and node["dot"]:
             if last_send > datetime.now() - relativedelta.relativedelta(days=1):
                 print(
                     f"Notification already sent for {node['name']} in the last day",
                     flush=True,
                 )
                 return
 
     # Save the notification to the file
     sent_notifications[node["ip"]] = datetime.strftime(
         datetime.now(), "%Y-%m-%d %H:%M:%S"
     )
     with open(f"{log_dir}/sent_notifications.json", "w") as file:
         json.dump(sent_notifications, file, indent=4)
 
     title = f"{node['name']} is down"
 
     description = f"{node['name']} ({node['ip']}) is down with the following issues:\n"
     if not node["plain_dns"]:
         description += "- Plain DNS is down\n"
     if not node["doh"]:
         description += "- DoH is down\n"
     if not node["dot"]:
         description += "- DoT is down\n"
     if not node["cert"]["valid"]:
         description += "- Certificate on port 443 is invalid\n"
     if not node["cert_853"]["valid"]:
         description += "- Certificate on port 853 is invalid\n"
 
     if node["plain_dns"] and node["doh"] and node["dot"]:
         if node["cert"]["valid"] and node["cert_853"]["valid"]:
             description = f"The certificate on {node['name']} ({node['ip']}) is expiring soon\n"
             title = f"{node['name']} certificate is expiring soon"
             # Also add the expiry date of the certificates
             description += "\nCertificate expiry dates:\n"
             description += f"- Certificate on port 443 expires {node['cert']['expires']}\n"
             description += f"- Certificate on port 853 expires {node['cert_853']['expires']}\n"
     send_notification(title, description, node["name"])
 
 
 # endregion
 
 # region File logs
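One caveat with the per-check timeouts in check_single_node above: future.result(timeout=5) stops waiting, but the worker thread keeps running, and the with ThreadPoolExecutor(...) block joins all workers on exit, so one slow check can still hold the function open well past the 5 second budget. A sketch of a stricter variant on Python 3.9+, where shutdown accepts cancel_futures (the slow_check function is illustrative only):

import concurrent.futures
import time

def slow_check():
    time.sleep(30)  # simulates a check that ignores the caller's deadline
    return True

executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
future = executor.submit(slow_check)
try:
    result = future.result(timeout=5)
except concurrent.futures.TimeoutError:
    result = False  # give up on this check
finally:
    # Python 3.9+: cancel queued work; shutdown itself returns
    # immediately without joining the worker threads.
    executor.shutdown(wait=False, cancel_futures=True)

print(result)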
@@ -786,9 +718,9 @@ def create_default_node_dict():
         "name": "",
         "location": "",
         "ip": "",
-        "plain_dns": {"last_down": "Never", "percentage": 0},
-        "doh": {"last_down": "Never", "percentage": 0},
-        "dot": {"last_down": "Never", "percentage": 0},
+        "plain_dns": {"last_down": "never", "percentage": 0},
+        "doh": {"last_down": "never", "percentage": 0},
+        "dot": {"last_down": "never", "percentage": 0},
     }
 
 def create_default_counts_dict():
@@ -804,9 +736,9 @@ def summarize_history(history: list) -> dict:
     nodes_status = defaultdict(create_default_node_dict)
 
     overall_status = {
-        "plain_dns": {"last_down": "Never", "percentage": 0},
-        "doh": {"last_down": "Never", "percentage": 0},
-        "dot": {"last_down": "Never", "percentage": 0},
+        "plain_dns": {"last_down": "never", "percentage": 0},
+        "doh": {"last_down": "never", "percentage": 0},
+        "dot": {"last_down": "never", "percentage": 0},
     }
 
     # Collect data
@@ -834,7 +766,7 @@ def summarize_history(history: list) -> dict:
             for key in ["plain_dns", "doh", "dot"]:
                 if node.get(key) == False:
                     # Check if the last downtime is more recent
-                    if nodes_status[ip][key]["last_down"] == "Never":
+                    if nodes_status[ip][key]["last_down"] == "never":
                         nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S")
                     elif date > datetime.strptime(nodes_status[ip][key]["last_down"], "%Y-%m-%d %H:%M:%S"):
                         nodes_status[ip][key]["last_down"] = date.strftime("%Y-%m-%d %H:%M:%S")
@@ -875,7 +807,7 @@ def summarize_history(history: list) -> dict:
         last_downs = [
             nodes_status[ip][key]["last_down"]
             for ip in nodes_status
-            if nodes_status[ip][key]["last_down"] != "Never"
+            if nodes_status[ip][key]["last_down"] != "never"
         ]
         if last_downs:
             overall_status[key]["last_down"] = max(last_downs)
@@ -961,7 +893,7 @@ def api_index():
 
 # Cache node status for API requests
 @app.route("/api/nodes")
-@cache.cached(timeout=60)  # Cache for 1 minute
+@cache.cached(timeout=300)  # Increased from 60s to 5 minutes
 def api_nodes():
     node_status = check_nodes_from_log()
     return jsonify(node_status)
@@ -1118,6 +1050,14 @@ def api_errors():
 @app.route("/api/check/<ip>")
 @cache.cached(timeout=30)  # Cache for 30 seconds
 def api_check(ip: str):
+    # Verify IP is one of the nodes
+    global nodes
+    if not nodes:
+        return jsonify({"error": "No nodes available"}), 404
+    if ip not in nodes:
+        return jsonify({"error": f"Node {ip} not found"}), 404
+
     logger.info(f"Checking node {ip}")
     data = {
         "ip": ip,
@@ -1150,8 +1090,19 @@ def api_check(ip: str):
 # region Main routes
 # Cache the main page rendering
 @app.route("/")
-@cache.cached(timeout=60, query_string=True)  # Cache for 1 minute, respect query params
+@cache.cached(timeout=120, query_string=True)  # Increased from 60s to 2 minutes
 def index():
+    # Check for fast_load parameter to provide a quicker initial page load
+    fast_load = request.args.get('fast_load', 'false').lower() == 'true'
+
+    if fast_load:
+        # Return a minimal template that will load data via JavaScript
+        return render_template(
+            "index_fast.html",
+            api_url=request.url_root + "api"
+        )
+
+    # Original slower but complete load
     node_status = check_nodes_from_log()
 
     alerts = []
@@ -1218,7 +1169,7 @@ def index():
     # Convert time to relative time
     for node in history_summary["nodes"]:
         for key in ["plain_dns", "doh", "dot"]:
-            if node[key]["last_down"] == "Never":
+            if node[key]["last_down"] == "never":
                 node[key]["last_down"] = "over 30 days ago"
             else:
                 node[key]["last_down"] = format_last_check(
@@ -1226,7 +1177,7 @@ def index():
         )
 
     for key in ["plain_dns", "doh", "dot"]:
-        if history_summary["overall"][key]["last_down"] == "Never":
+        if history_summary["overall"][key]["last_down"] == "never":
             continue
         history_summary["overall"][key]["last_down"] = format_last_check(
             datetime.strptime(history_summary["overall"][key]["last_down"], "%Y-%m-%d %H:%M:%S")
@@ -1307,20 +1258,16 @@ def scheduled_node_check():
         global nodes, _node_status_cache, _node_status_cache_time
         nodes = []  # Reset node list to force refresh
 
-        # Run the check and update in-memory cache
+        # Run the check (which now uses ThreadPoolExecutor)
         node_status = check_nodes()
-        _node_status_cache = node_status
-        _node_status_cache_time = datetime.now()
 
         # Clear relevant caches
         cache.delete_memoized(api_nodes)
         cache.delete_memoized(api_errors)
         cache.delete_memoized(index)
 
         logger.info("Completed scheduled node check and updated caches")
     except Exception as e:
         logger.error(f"Error in scheduled node check: {e}")
 
 
 def scheduler_listener(event):
     """Listener for scheduler events"""
     if event.exception:
@@ -1339,7 +1286,6 @@ def start_scheduler():
         check_interval = 5
 
     logger.info(f"Setting up scheduler to run every {check_interval} minutes")
-
     # Add the job to the scheduler
     scheduler.add_job(
         scheduled_node_check,
@@ -1347,10 +1293,9 @@ def start_scheduler():
         id='node_check_job',
         replace_existing=True
     )
 
-    logger.info(f"Setting up scheduler to run every {check_interval} minutes")
     # Add listener for job events
     scheduler.add_listener(scheduler_listener, EVENT_JOB_ERROR | EVENT_JOB_EXECUTED)
 
     # Start the scheduler if it's not already running
     if not scheduler.running:
         scheduler.start()
@@ -1364,10 +1309,6 @@ def signal_handler(sig, frame):
     logger.info("Scheduler shut down")
     sys.exit(0)
 
-# Register the signal handlers for Docker
-signal.signal(signal.SIGINT, signal_handler)
-signal.signal(signal.SIGTERM, signal_handler)
-
 # Initialize the scheduler when the app starts without relying on @before_first_request
 # which is deprecated in newer Flask versions
 with app.app_context():
@@ -1388,32 +1329,26 @@ def add_compression(response):
             'Content-Encoding' in response.headers or
             response.direct_passthrough):
         return response
 
     # Only compress specific MIME types
     content_type = response.headers.get('Content-Type', '')
     compressible_types = [
         'text/html',
         'text/css',
         'text/plain',
-        'application/javascript',
         'application/javascript',
         'application/json',
         'application/xml',
         'text/xml'
     ]
 
     if not any(t in content_type for t in compressible_types):
         return response
 
     accept_encoding = request.headers.get('Accept-Encoding', '')
 
     if 'br' in accept_encoding:
         try:
             # Get the response content
             response_data = response.get_data()
 
             # Compress with Brotli
             compressed_data = brotli.compress(response_data, quality=6)
 
             # Only apply Brotli if it results in smaller size
             if len(compressed_data) < len(response_data):
                 response.set_data(compressed_data)
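The quality=6 argument above sits mid-range on Brotli's 0-11 scale, trading compression ratio for CPU time on every response. A standalone sketch of the same only-if-smaller guard, using the brotli package:

import brotli  # pip install brotli

payload = b'{"status": "ok"}' * 100
compressed = brotli.compress(payload, quality=6)  # 0-11; 11 is slowest/smallest

# Serve the compressed body only when it actually saves bytes.
if len(compressed) < len(payload):
    body, encoding = compressed, "br"
else:
    body, encoding = payload, "identity"

print(len(payload), len(compressed), encoding)
assert brotli.decompress(compressed) == payload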
@@ -1422,10 +1357,163 @@ def add_compression(response):
         except Exception as e:
             logger.warning(f"Brotli compression failed: {e}")
             # If compression fails, we just return the uncompressed response
 
     return response
 
+def check_nodes_from_log():
+    """Read the most recent node status from the log file."""
+    global _node_status_cache, _node_status_cache_time
+
+    # Return cached result if it's less than 2 minutes old (increased from 60s)
+    with cache_lock:
+        if _node_status_cache is not None and _node_status_cache_time is not None:
+            if (datetime.now() - _node_status_cache_time).total_seconds() < 120:
+                logger.debug("Using cached node status")
+                return _node_status_cache
+
+    try:
+        # Load the last log
+        with open(f"{log_dir}/node_status.json", "r") as file:
+            data = json.load(file)
+
+        newest = {
+            "date": datetime.now() - relativedelta.relativedelta(years=1),
+            "nodes": [],
+        }
+
+        for entry in data:
+            entry_date = datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S")
+            if entry_date > newest["date"]:
+                newest = entry
+                newest["date"] = entry_date
+
+        # Update the cache
+        with cache_lock:
+            _node_status_cache = newest["nodes"]
+            _node_status_cache_time = datetime.now()
+
+        return newest["nodes"]
+    except Exception as e:
+        logger.error(f"Error reading node status from log: {e}")
+        # If we can't read from the log, run a fresh check
+        return check_nodes()
+
+# Add a lightweight status function for quick status checks
+@app.route("/api/quick-status")
+@cache.cached(timeout=30)  # Cache for 30 seconds
+def quick_status():
+    """Return a minimal status without expensive node checks"""
+    try:
+        # Load the last log
+        with open(f"{log_dir}/node_status.json", "r") as file:
+            data = json.load(file)
+
+        if not data:
+            return jsonify({"status": "unknown", "last_check": "never"})
+
+        newest_entry = max(data, key=lambda x: datetime.strptime(x["date"], "%Y-%m-%d %H:%M:%S"))
+        last_check_time = format_last_check(datetime.strptime(newest_entry["date"], "%Y-%m-%d %H:%M:%S"))
+
+        # Count nodes with issues
+        node_status = newest_entry["nodes"]
+        total_nodes = len(node_status)
+        nodes_with_issues = 0
+
+        for node in node_status:
+            if (not node["plain_dns"] or not node["doh"] or not node["dot"] or
+                    not node["cert"]["valid"] or not node["cert_853"]["valid"]):
+                nodes_with_issues += 1
+
+        return jsonify({
+            "status": "ok" if nodes_with_issues == 0 else "issues",
+            "last_check": last_check_time,
+            "total_nodes": total_nodes,
+            "nodes_with_issues": nodes_with_issues
+        })
+    except Exception as e:
+        logger.error(f"Error getting quick status: {e}")
+        return jsonify({"status": "error", "message": str(e)})
+
+# Optimize check_single_node with shorter timeouts
+def check_single_node(ip):
+    """Check a single node and return its status."""
+    logger.info(f"Checking node {ip}")
+    try:
+        # Add timeout handling for individual checks
+        plain_dns_result = False
+        doh_result = {"status": False, "server": []}
+        dot_result = False
+        cert_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+        cert_853_result = {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"}
+
+        # Use timeout to limit time spent on each check
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            future_plain_dns = executor.submit(check_plain_dns, ip)
+            future_doh = executor.submit(check_doh, ip)
+            future_dot = executor.submit(check_dot, ip)
+            future_cert = executor.submit(verify_cert, ip, 443)
+            future_cert_853 = executor.submit(verify_cert, ip, 853)
+
+            # Collect results with timeout
+            try:
+                plain_dns_result = future_plain_dns.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Plain DNS check timed out for {ip}: {str(e)}")
+
+            try:
+                doh_result = future_doh.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoH check timed out for {ip}: {str(e)}")
+
+            try:
+                dot_result = future_dot.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"DoT check timed out for {ip}: {str(e)}")
+
+            try:
+                cert_result = future_cert.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert check timed out for {ip}: {str(e)}")
+
+            try:
+                cert_853_result = future_cert_853.result(timeout=5)
+            except (concurrent.futures.TimeoutError, Exception) as e:
+                logger.warning(f"Cert 853 check timed out for {ip}: {str(e)}")
+
+        node_status = {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": plain_dns_result,
+            "doh": doh_result["status"],
+            "doh_server": doh_result["server"],
+            "dot": dot_result,
+            "cert": cert_result,
+            "cert_853": cert_853_result,
+        }
+        logger.info(f"Node {ip} check complete")
+        return node_status
+    except Exception as e:
+        logger.error(f"Error checking node {ip}: {e}")
+        # Add a failed entry for this node to ensure it's still included
+        return {
+            "ip": ip,
+            "name": node_names[ip] if ip in node_names else ip,
+            "location": (
+                node_locations[ip] if ip in node_locations else "Unknown"
+            ),
+            "plain_dns": False,
+            "doh": False,
+            "doh_server": [],
+            "dot": False,
+            "cert": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+            "cert_853": {"valid": False, "expires": "ERROR", "expiry_date": "ERROR"},
+        }
+
+# Run the app with threading enabled
 if __name__ == "__main__":
     # The scheduler is already started in the app context above
-    # Run the Flask app
-    app.run(debug=True, port=5000, host="0.0.0.0")
+    # Run the Flask app with threading for better concurrency
+    app.run(debug=True, port=5000, host="0.0.0.0", threaded=True)
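For a sense of how the new endpoint is meant to be consumed, a small client sketch (assumes the app is running locally on the port used by app.run above; requests is a third-party dependency):

import requests

resp = requests.get("http://localhost:5000/api/quick-status", timeout=5)
data = resp.json()

if data.get("status") == "ok":
    print(f"All {data['total_nodes']} nodes healthy "
          f"(last check: {data['last_check']})")
else:
    print(f"{data.get('nodes_with_issues', '?')} node(s) reporting issues")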
templates/index_fast.html (146 lines added, new file)
@@ -0,0 +1,146 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>HNSDoH Status</title>
+    <link rel="stylesheet" href="/assets/css/style.css">
+    <meta name="description" content="HNSDoH Status page - Monitoring the status of HNSDoH resolvers">
+    <link rel="manifest" href="/manifest.json">
+    <link rel="icon" type="image/png" href="/favicon.png">
+    <style>
+        .loader {
+            border: 5px solid #f3f3f3;
+            border-radius: 50%;
+            border-top: 5px solid #3498db;
+            width: 40px;
+            height: 40px;
+            margin: 20px auto;
+            animation: spin 1.5s linear infinite;
+        }
+
+        @keyframes spin {
+            0% { transform: rotate(0deg); }
+            100% { transform: rotate(360deg); }
+        }
+
+        .quick-status {
+            text-align: center;
+            padding: 20px;
+            margin: 20px;
+            border-radius: 5px;
+            background-color: #f5f5f5;
+        }
+
+        .status-ok {
+            color: green;
+        }
+
+        .status-issues {
+            color: orange;
+        }
+
+        .status-error {
+            color: red;
+        }
+    </style>
+</head>
+<body>
+    <header>
+        <h1>HNSDoH Status</h1>
+        <p>Monitoring the status of HNSDoH resolvers</p>
+    </header>
+
+    <div class="quick-status">
+        <h2>Current Status</h2>
+        <div id="quick-status-display">Loading...</div>
+        <div class="loader" id="status-loader"></div>
+    </div>
+
+    <main>
+        <div id="content">
+            <div class="loader"></div>
+            <p>Loading full status data...</p>
+            <p>This may take a few moments as we check all resolver nodes.</p>
+        </div>
+    </main>
+
+    <footer>
+        <p>Made by <a href="https://nathan.woodburn.au">Nathan.Woodburn/</a></p>
+    </footer>
+
+    <script>
+        // Load quick status first
+        fetch('/api/quick-status')
+            .then(response => response.json())
+            .then(data => {
+                const statusDisplay = document.getElementById('quick-status-display');
+                const statusLoader = document.getElementById('status-loader');
+
+                let statusClass = 'status-ok';
+                let statusMessage = 'All systems operational';
+
+                if (data.status === 'issues') {
+                    statusClass = 'status-issues';
+                    statusMessage = `${data.nodes_with_issues} out of ${data.total_nodes} nodes have issues`;
+                } else if (data.status === 'error' || data.status === 'unknown') {
+                    statusClass = 'status-error';
+                    statusMessage = 'Unable to determine system status';
+                }
+
+                statusDisplay.innerHTML = `
+                    <h3 class="${statusClass}">${statusMessage}</h3>
+                    <p>Last check: ${data.last_check}</p>
+                `;
+
+                statusLoader.style.display = 'none';
+            })
+            .catch(error => {
+                document.getElementById('quick-status-display').innerHTML = `
+                    <h3 class="status-error">Error loading status</h3>
+                    <p>${error}</p>
+                `;
+                document.getElementById('status-loader').style.display = 'none';
+            });
+
+        // Then load full page data
+        fetch('/api/nodes')
+            .then(response => response.json())
+            .then(nodeData => {
+                // Once we have node data, get history data
+                return Promise.all([
+                    Promise.resolve(nodeData),
+                    fetch('/api/history').then(res => res.json())
+                ]);
+            })
+            .then(([nodeData, historyData]) => {
+                // Now we have both datasets, fetch the HTML with them
+                return fetch('/?' + new URLSearchParams({
+                    _data_loaded: 'true' // Signal to the server we already have data
+                }));
+            })
+            .then(response => response.text())
+            .then(html => {
+                document.getElementById('content').innerHTML = html;
+
+                // Replace direct links with JS-enhanced versions
+                document.querySelectorAll('a').forEach(link => {
+                    const href = link.getAttribute('href');
+                    if (href && href.startsWith('/') && !href.includes('fast_load')) {
+                        link.setAttribute('href', href + (href.includes('?') ? '&' : '?') + 'fast_load=true');
+                    }
+                });
+            })
+            .catch(error => {
+                document.getElementById('content').innerHTML = `
+                    <div class="error">
+                        <h2>Error Loading Data</h2>
+                        <p>There was a problem loading the full status data. Please try refreshing the page.</p>
+                        <p>Error details: ${error}</p>
+                        <a href="/" class="button">Refresh Page</a>
+                    </div>
+                `;
+            });
+    </script>
+</body>
+</html>