#!/usr/bin/env python3
"""
sitemap_analyzer.py — Analyzes sitemap.xml files for structure, depth, and potential issues.

Usage:
    python3 sitemap_analyzer.py [sitemap.xml]
    python3 sitemap_analyzer.py https://example.com/sitemap.xml  (fetches via urllib)
    cat sitemap.xml | python3 sitemap_analyzer.py

If no file is provided, runs on an embedded sample sitemap for demonstration.

Output: Structural analysis with depth distribution, URL patterns, deep-page and
        dynamic-URL detection, duplicate path flagging, and a JSON summary.
Stdlib only — no external dependencies.
"""

import argparse
import json
import re
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
from urllib.parse import urlparse


# ─── Namespaces used in sitemaps ─────────────────────────────────────────────

SITEMAP_NAMESPACES = {
    "sm": "http://www.sitemaps.org/schemas/sitemap/0.9",
    "image": "http://www.google.com/schemas/sitemap-image/1.1",
    "video": "http://www.google.com/schemas/sitemap-video/1.1",
    "news": "http://www.google.com/schemas/sitemap-news/0.9",
    "xhtml": "http://www.w3.org/1999/xhtml",
}
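# The dict above is reference material: ElementTree can resolve these prefixes
# directly, e.g. root.findall("sm:url", SITEMAP_NAMESPACES). The parser below
# instead uses the "{*}" wildcard (Python 3.8+), so it works whether or not a
# sitemap declares its namespaces.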

# ─── Sample sitemap (embedded) ────────────────────────────────────────────────

SAMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

  <!-- Homepage -->
  <url>
    <loc>https://example.com/</loc>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>

  <!-- Top-level pages -->
  <url><loc>https://example.com/pricing</loc></url>
  <url><loc>https://example.com/about</loc></url>
  <url><loc>https://example.com/contact</loc></url>
  <url><loc>https://example.com/blog</loc></url>

  <!-- Features section -->
  <url><loc>https://example.com/features</loc></url>
  <url><loc>https://example.com/features/email-automation</loc></url>
  <url><loc>https://example.com/features/crm-integration</loc></url>
  <url><loc>https://example.com/features/analytics</loc></url>

  <!-- Solutions section -->
  <url><loc>https://example.com/solutions/sales-teams</loc></url>
  <url><loc>https://example.com/solutions/marketing-teams</loc></url>

  <!-- Blog posts (various topics) -->
  <url><loc>https://example.com/blog/cold-email-guide</loc></url>
  <url><loc>https://example.com/blog/email-open-rates</loc></url>
  <url><loc>https://example.com/blog/crm-comparison</loc></url>
  <url><loc>https://example.com/blog/sales-process-optimization</loc></url>

  <!-- Deeply nested pages (potential issue) -->
  <url><loc>https://example.com/resources/guides/email/cold-outreach/advanced/templates</loc></url>
  <url><loc>https://example.com/resources/guides/email/cold-outreach/advanced/scripts</loc></url>

  <!-- Duplicate path patterns (potential issue) -->
  <url><loc>https://example.com/blog/email-tips</loc></url>
  <url><loc>https://example.com/resources/email-tips</loc></url>

  <!-- Dynamic-looking URL (potential issue) -->
  <url><loc>https://example.com/search?q=cold+email&amp;sort=recent</loc></url>

  <!-- Case studies -->
  <url><loc>https://example.com/customers/acme-corp</loc></url>
  <url><loc>https://example.com/customers/globex</loc></url>

  <!-- Legal pages (often over-linked) -->
  <url><loc>https://example.com/privacy</loc></url>
  <url><loc>https://example.com/terms</loc></url>

</urlset>
"""


# ─── URL Analysis ─────────────────────────────────────────────────────────────

def get_depth(path: str) -> int:
    """Return depth of a URL path. / = 0, /blog = 1, /blog/post = 2, etc."""
    parts = [p for p in path.strip("/").split("/") if p]
    return len(parts)


def get_path_pattern(path: str) -> str:
    """Collapse variable segments to {slug} so URLs group into patterns.

    The last segment of a multi-segment path is treated as the variable part
    ("/blog/some-post" -> "/blog/{slug}"); earlier segments are kept only if
    they look like static structure (short, lowercase, hyphenated).
    """
    parts = [p for p in path.strip("/").split("/") if p]
    if not parts:
        return "/"
    last_index = len(parts) - 1
    normalized = []
    for i, p in enumerate(parts):
        looks_static = bool(re.match(r"^[a-z][a-z-]*$", p)) and len(p) < 30
        is_variable = (i == last_index and last_index > 0) or not looks_static
        normalized.append("{slug}" if is_variable else p)
    return "/" + "/".join(normalized)


def looks_like_dynamic_url(url: str) -> bool:
    """True if the URL carries a query string (e.g. ?q=...&sort=...)."""
    return bool(urlparse(url).query)
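# e.g. urlparse("https://example.com/search?q=x").query == "q=x", so any URL
# with a non-empty query string is flagged as dynamic-looking.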


def detect_path_siblings(urls: list) -> list:
    """Find URLs with same slug in different parent directories (potential duplicates)."""
    slug_to_paths = defaultdict(list)
    for url in urls:
        path = urlparse(url).path.strip("/")
        slug = path.split("/")[-1] if path else ""
        if slug:
            slug_to_paths[slug].append(url)

    duplicates = []
    for slug, paths in slug_to_paths.items():
        if len(paths) > 1:
            # Only flag if they're in different directories
            parents = set("/".join(urlparse(p).path.strip("/").split("/")[:-1]) for p in paths)
            if len(parents) > 1:
                duplicates.append({"slug": slug, "urls": paths})
    return duplicates
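# Illustrative result for the embedded sample, where "email-tips" appears
# under both /blog/ and /resources/:
#   [{"slug": "email-tips",
#     "urls": ["https://example.com/blog/email-tips",
#              "https://example.com/resources/email-tips"]}]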


# ─── Sitemap Parser ──────────────────────────────────────────────────────────

def parse_sitemap(content: str) -> list:
    """Parse sitemap XML and return a list of URL dicts."""
    urls = []

    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f"❌ XML parse error: {e}", file=sys.stderr)
        return []

    # "{*}" matches any namespace (Python 3.8+), so namespaced and
    # namespace-less documents both parse without any string munging.
    local_tag = root.tag.rsplit("}", 1)[-1]

    # Handle sitemap index (points to other sitemaps)
    if local_tag == "sitemapindex":
        print("ℹ️  This is a sitemap index file — it points to child sitemaps.")
        print("   Child sitemaps:")
        for sitemap in root.findall(".//{*}loc"):
            print(f"   - {sitemap.text}")
        print("   Run this tool on each child sitemap for full analysis.")
        return []

    # Regular urlset. Element truthiness reflects child count, not presence,
    # so use explicit "is not None" checks rather than "find(...) or ..." fallbacks.
    for url_el in root.findall(".//{*}url"):
        loc_el = url_el.find("{*}loc")
        lastmod_el = url_el.find("{*}lastmod")
        priority_el = url_el.find("{*}priority")

        if loc_el is None or not loc_el.text:
            continue

        priority = None
        if priority_el is not None and priority_el.text:
            try:
                priority = float(priority_el.text.strip())
            except ValueError:
                pass  # malformed <priority>; leave as None

        urls.append({
            "url": loc_el.text.strip(),
            "lastmod": lastmod_el.text.strip() if lastmod_el is not None and lastmod_el.text else None,
            "priority": priority,
        })

    return urls
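# Illustrative shape of one parsed entry (fields absent from the XML are None):
#   {"url": "https://example.com/pricing", "lastmod": None, "priority": None}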


# ─── Analysis Engine ─────────────────────────────────────────────────────────

def analyze_urls(urls: list) -> dict:
    raw_urls = [u["url"] for u in urls]
    paths = [urlparse(u).path for u in raw_urls]

    depths = [get_depth(p) for p in paths]
    depth_counter = Counter(depths)

    dynamic_urls = [u for u in raw_urls if looks_like_dynamic_url(u)]

    patterns = Counter(get_path_pattern(p) for p in paths)
    top_patterns = patterns.most_common(10)

    duplicate_slugs = detect_path_siblings(raw_urls)

    deep_urls = [(u, d) for u, d in zip(raw_urls, depths) if d >= 4]

    # Extract top-level directories
    top_dirs = Counter()
    for p in paths:
        parts = p.strip("/").split("/")
        if parts and parts[0]:
            top_dirs[parts[0]] += 1

    return {
        "total_urls": len(urls),
        "depth_distribution": dict(sorted(depth_counter.items())),
        "top_directories": dict(top_dirs.most_common(15)),
        "dynamic_urls": dynamic_urls,
        "deep_pages": deep_urls,
        "duplicate_slug_candidates": duplicate_slugs,
        "top_url_patterns": [{"pattern": p, "count": c} for p, c in top_patterns],
    }


# ─── Report Printer ──────────────────────────────────────────────────────────

def grade_depth_distribution(dist: dict) -> str:
    deep = sum(v for k, v in dist.items() if k >= 4)
    total = sum(dist.values())
    if total == 0:
        return "N/A"
    pct = deep / total * 100
    if pct < 5:
        return "🟢 Excellent"
    if pct < 15:
        return "🟡 Acceptable"
    return "🔴 Too many deep pages"


def print_report(analysis: dict) -> None:
    print("\n" + "═" * 62)
    print("  SITEMAP STRUCTURE ANALYSIS")
    print("═" * 62)
    print(f"\n  Total URLs: {analysis['total_urls']}")

    print("\n── Depth Distribution ──")
    dist = analysis["depth_distribution"]
    total = analysis["total_urls"]
    for depth, count in sorted(dist.items()):
        pct = count / total * 100 if total else 0
        bar = "█" * int(pct / 2)
        label = "homepage" if depth == 0 else f"{'  ' * min(depth, 3)}/{'…/' * (depth - 1)}page"
        print(f"   Depth {depth}: {count:4d} pages ({pct:5.1f}%)  {bar}  {label}")

    print(f"\n   Rating: {grade_depth_distribution(dist)}")
    deep_pct = sum(v for k, v in dist.items() if k >= 4) / total * 100 if total else 0
    if deep_pct >= 5:
        print("   ⚠️  More than 5% of pages are 4+ levels deep.")
        print("      Consider flattening structure or adding shortcut links.")

    print("\n── Top-Level Directories ──")
    for d, count in analysis["top_directories"].items():
        pct = count / total * 100 if total else 0
        print(f"   /{d:<30s}  {count:4d} URLs ({pct:.1f}%)")

    print("\n── URL Pattern Analysis ──")
    for p in analysis["top_url_patterns"]:
        print(f"   {p['pattern']:<45s}  {p['count']:4d} URLs")

    if analysis["dynamic_urls"]:
        print(f"\n── Dynamic URLs Detected ({len(analysis['dynamic_urls'])}) ──")
        print("   ⚠️  URLs with query parameters should usually be excluded from sitemap.")
        print("      Use canonical tags or robots.txt to prevent duplicate content indexing.")
        for u in analysis["dynamic_urls"][:5]:
            print(f"   {u}")
        if len(analysis["dynamic_urls"]) > 5:
            print(f"   ... and {len(analysis['dynamic_urls']) - 5} more")

    if analysis["deep_pages"]:
        print(f"\n── Deep Pages (4+ Levels) ({len(analysis['deep_pages'])}) ──")
        print("   ⚠️  Pages this deep may have weak crawl equity. Add internal shortcuts.")
        for url, depth in analysis["deep_pages"][:5]:
            print(f"   Depth {depth}: {url}")
        if len(analysis["deep_pages"]) > 5:
            print(f"   ... and {len(analysis['deep_pages']) - 5} more")

    if analysis["duplicate_slug_candidates"]:
        print(f"\n── Potential Duplicate Path Issues ({len(analysis['duplicate_slug_candidates'])}) ──")
        print("   ⚠️  Same slug appears in multiple directories — possible duplicate content.")
        for item in analysis["duplicate_slug_candidates"][:5]:
            print(f"   Slug: '{item['slug']}'")
            for u in item["urls"]:
                print(f"     - {u}")
        if len(analysis["duplicate_slug_candidates"]) > 5:
            print(f"   ... and {len(analysis['duplicate_slug_candidates']) - 5} more")

    print("\n── Recommendations ──")
    has_issues = False
    if analysis["dynamic_urls"]:
        print("   1. Remove dynamic URLs (with ?) from sitemap.")
        has_issues = True
    if analysis["deep_pages"]:
        print(f"   {'2' if has_issues else '1'}. Flatten deep URL structures or add internal shortcut links.")
        has_issues = True
    if analysis["duplicate_slug_candidates"]:
        print(f"   {'3' if has_issues else '1'}. Review duplicate slug paths — consolidate or add canonical tags.")
        has_issues = True
    if not has_issues:
        print("   ✅ No major structural issues detected in this sitemap.")

    print("\n" + "═" * 62)


# ─── Main ─────────────────────────────────────────────────────────────────────

def load_content(source: str) -> str:
    """Load sitemap from file path, URL, or stdin."""
    if source.startswith("http://") or source.startswith("https://"):
        try:
            with urllib.request.urlopen(source, timeout=10) as resp:
                return resp.read().decode("utf-8")
        except urllib.error.URLError as e:
            print(f"Error fetching URL: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        try:
            with open(source, "r", encoding="utf-8") as f:
                return f.read()
        except OSError as e:
            print(f"Error reading file {source}: {e}", file=sys.stderr)
            sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="Analyzes sitemap.xml files for structure, depth, and potential issues. "
                    "Reports depth distribution, URL patterns, orphan candidates, and duplicates."
    )
    parser.add_argument(
        "file", nargs="?", default=None,
        help="Path to a sitemap.xml file or URL (https://...). "
             "Use '-' to read from stdin. If omitted, runs embedded sample."
    )
    args = parser.parse_args()

    if args.file:
        if args.file == "-":
            content = sys.stdin.read()
        else:
            content = load_content(args.file)
    else:
        print("No file or URL provided — running on embedded sample sitemap.\n")
        content = SAMPLE_SITEMAP

    urls = parse_sitemap(content)
    if not urls:
        print("No URLs found in sitemap.", file=sys.stderr)
        sys.exit(1)

    analysis = analyze_urls(urls)
    print_report(analysis)

    # JSON output
    print("\n── JSON Summary ──")
    summary = {
        "total_urls": analysis["total_urls"],
        "depth_distribution": analysis["depth_distribution"],
        "dynamic_url_count": len(analysis["dynamic_urls"]),
        "deep_page_count": len(analysis["deep_pages"]),
        "duplicate_slug_count": len(analysis["duplicate_slug_candidates"]),
        "top_directories": analysis["top_directories"],
    }
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
