feat(quick-3): add ureq dep, allowed_remote_domains config, and fetch_remote_markdown

- Add ureq = "2.12" to Cargo.toml for synchronous HTTP fetching
- Add allowed_remote_domains: Vec<String> field to Config struct with serde default
- Add RemoteDocument enum with Loaded/DomainNotAllowed/FetchError/NotMarkdown variants
- Add fetch_remote_markdown() with domain whitelist, 10s timeout, content-type validation, 5MB body limit
2026-03-01 13:12:27 +01:00
parent eb1c7866ce
commit 5759ec83e6
3 changed files with 155 additions and 1 deletions
@@ -15,3 +15,4 @@ syntect-tui = "3.0"
 notify = "6.1"
 ansi-to-tui = "8.0"
 walkdir = "2.5"
+ureq = "2.12"
@@ -12,6 +12,12 @@ pub struct Config {
    #[serde(default = "default_theme")]
    #[allow(dead_code)]
    pub theme: String,
+
+    #[serde(default)]
+    pub margin: u16,
+
+    #[serde(default)]
+    pub allowed_remote_domains: Vec<String>,
 }

 fn default_vault_path() -> PathBuf {
@@ -27,6 +33,8 @@ impl Default for Config {
        Config {
            vault_path: default_vault_path(),
            theme: default_theme(),
+            margin: 0,
+            allowed_remote_domains: Vec::new(),
        }
    }
 }
@@ -1,5 +1,6 @@
-use std::io::{self, BufRead};
+use std::io::{self, BufRead, Read};
 use std::path::{Path, PathBuf};
+use std::time::Duration;
 use walkdir::WalkDir;

 // ── VaultDocument ─────────────────────────────────────────────────────────────
@@ -189,6 +190,150 @@ pub fn resolve_standard_link(vault_path: &Path, current_doc: &str, dest: &str) -
    None
 }

+// ── Remote document fetching ──────────────────────────────────────────────────
+
/// Outcome of attempting to fetch a remote markdown document.
///
/// Every variant carries owned strings, so the value is cheap to move between
/// the fetching code and whatever renders the result. The derives allow the
/// outcome to be logged with `{:?}`, cloned, and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RemoteDocument {
    /// The fetch succeeded and the response was accepted as markdown.
    Loaded { url: String, content: String },
    /// The URL's domain is not covered by the whitelist.
    DomainNotAllowed { domain: String },
    /// The request could not be completed (network error, timeout, non-2xx
    /// status, or an unreadable body); `reason` is a human-readable message.
    FetchError { url: String, reason: String },
    /// The response does not look like markdown (e.g. an HTML page or binary
    /// data); `content_type` is the media type that caused the rejection.
    NotMarkdown { url: String, content_type: String },
}
+
/// Extract the lowercase host name from a URL string.
///
/// The result feeds the domain whitelist, so it must be the host an HTTP
/// client would actually connect to. The authority component therefore ends at
/// the first `/`, `\`, `?` or `#` (not only at `/`), and any `user:password@`
/// prefix is discarded. Without this, a URL such as
/// `https://evil.com?x=.example.com` would be reported as a host ending in
/// `.example.com`.
///
/// Returns `None` when the URL has no `://` separator or the host is empty.
///
/// Examples:
/// - `https://example.com/foo` → `example.com`
/// - `https://sub.example.com:8080/bar` → `sub.example.com`
/// - `https://user:pw@example.com/` → `example.com`
/// - `https://evil.com?x=.example.com` → `evil.com`
fn extract_domain(url: &str) -> Option<String> {
    // Drop the scheme: "https://example.com/..." → "example.com/..."
    let (_, after_scheme) = url.split_once("://")?;

    // The authority stops at the start of the path, query or fragment. A
    // backslash is included because URL parsers following the WHATWG rules
    // treat it like '/' for http(s) URLs.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '\\' | '?' | '#'))
        .next()
        .unwrap_or(after_scheme);

    // Discard userinfo; the host starts after the *last* '@' in the authority.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_port)| host_port);

    // Strip a trailing ":port" only when it is purely numeric, so a bracketed
    // IPv6 literal such as "[::1]" is left intact.
    let host = match host_port.rsplit_once(':') {
        Some((host, port)) if port.bytes().all(|b| b.is_ascii_digit()) => host,
        _ => host_port,
    };

    if host.is_empty() {
        return None;
    }
    Some(host.to_lowercase())
}
+
/// Check whether `domain` is permitted by the `allowed_domains` list.
///
/// Matching rules (case-insensitive on both sides):
/// - Exact match: `example.com` matches `example.com`
/// - Subdomain match: `sub.example.com` matches whitelist entry `example.com`
///
/// Whitelist entries are trimmed of surrounding whitespace, and blank entries
/// are ignored. A blank entry would otherwise act as the suffix `"."` and
/// allow every fully-qualified name written with a trailing dot
/// (e.g. `evil.com.`).
///
/// Returns `false` for an empty whitelist.
fn domain_is_allowed(domain: &str, allowed_domains: &[String]) -> bool {
    let domain = domain.to_lowercase();

    allowed_domains.iter().any(|entry| {
        let entry = entry.trim().to_lowercase();
        if entry.is_empty() {
            return false;
        }
        // `domain` must end with the entry, and whatever precedes it must be
        // either nothing (exact match) or a label boundary (subdomain match).
        // This keeps "notexample.com" from matching "example.com".
        match domain.strip_suffix(entry.as_str()) {
            Some(prefix) => prefix.is_empty() || prefix.ends_with('.'),
            None => false,
        }
    })
}
+
+/// Fetch a remote markdown document from `url` and validate its content.
+///
+/// Steps:
+/// 1. Extract domain from URL and check against `allowed_domains` whitelist.
+/// 2. Issue a GET request with a 10-second timeout via ureq.
+/// 3. Validate the Content-Type header (accept markdown/plain text; reject HTML/binary).
+/// 4. Read the response body (capped at 5 MB to prevent memory exhaustion).
+/// 5. Return the appropriate `RemoteDocument` variant.
+pub fn fetch_remote_markdown(url: &str, allowed_domains: &[String]) -> RemoteDocument {
+    // Step 1: Domain whitelist check
+    let domain = match extract_domain(url) {
+        Some(d) => d,
+        None => {
+            return RemoteDocument::FetchError {
+                url: url.to_string(),
+                reason: "Could not parse domain from URL".to_string(),
+            };
+        }
+    };
+
+    if !domain_is_allowed(&domain, allowed_domains) {
+        return RemoteDocument::DomainNotAllowed { domain };
+    }
+
+    // Step 2: HTTP GET with timeout
+    let response = match ureq::get(url)
+        .timeout(Duration::from_secs(10))
+        .call()
+    {
+        Ok(resp) => resp,
+        Err(ureq::Error::Status(code, resp)) => {
+            return RemoteDocument::FetchError {
+                url: url.to_string(),
+                reason: format!("HTTP {} {}", code, resp.status_text().to_string()),
+            };
+        }
+        Err(e) => {
+            return RemoteDocument::FetchError {
+                url: url.to_string(),
+                reason: e.to_string(),
+            };
+        }
+    };
+
+    // Step 3: Content-Type validation
+    let content_type = response
+        .header("Content-Type")
+        .unwrap_or("application/octet-stream")
+        .to_lowercase();
+
+    // Strip parameters like "; charset=utf-8"
+    let ct_base = content_type.split(';').next().unwrap_or("").trim().to_string();
+
+    let url_path_is_md = url.split('?').next().unwrap_or(url).ends_with(".md");
+
+    let is_acceptable = matches!(
+        ct_base.as_str(),
+        "text/markdown" | "text/plain" | "text/x-markdown"
+    ) || url_path_is_md;
+
+    let is_html = ct_base == "text/html";
+
+    if is_html || (!is_acceptable && !url_path_is_md) {
+        return RemoteDocument::NotMarkdown {
+            url: url.to_string(),
+            content_type: ct_base,
+        };
+    }
+
+    // Step 4: Read body with 5 MB limit
+    let mut body = String::new();
+    let mut reader = response.into_reader().take(5_000_000);
+    if let Err(e) = reader.read_to_string(&mut body) {
+        return RemoteDocument::FetchError {
+            url: url.to_string(),
+            reason: format!("Failed to read response body: {}", e),
+        };
+    }
+
+    RemoteDocument::Loaded {
+        url: url.to_string(),
+        content: body,
+    }
+}
+
 // ── Directory listing ─────────────────────────────────────────────────────────

 /// Entry in the vault directory listing.