diff --git a/Cargo.toml b/Cargo.toml index c0ecab7..9a16e85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ syntect-tui = "3.0" notify = "6.1" ansi-to-tui = "8.0" walkdir = "2.5" +ureq = "2.12" diff --git a/src/config.rs b/src/config.rs index 67c1505..c1ff689 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,6 +12,12 @@ pub struct Config { #[serde(default = "default_theme")] #[allow(dead_code)] pub theme: String, + + #[serde(default)] + pub margin: u16, + + #[serde(default)] + pub allowed_remote_domains: Vec, } fn default_vault_path() -> PathBuf { @@ -27,6 +33,8 @@ impl Default for Config { Config { vault_path: default_vault_path(), theme: default_theme(), + margin: 0, + allowed_remote_domains: Vec::new(), } } } diff --git a/src/vault.rs b/src/vault.rs index 5337d36..362efe4 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -1,5 +1,6 @@ -use std::io::{self, BufRead}; +use std::io::{self, BufRead, Read}; use std::path::{Path, PathBuf}; +use std::time::Duration; use walkdir::WalkDir; // ── VaultDocument ───────────────────────────────────────────────────────────── @@ -189,6 +190,150 @@ pub fn resolve_standard_link(vault_path: &Path, current_doc: &str, dest: &str) - None } +// ── Remote document fetching ────────────────────────────────────────────────── + +/// Result of attempting to fetch a remote markdown document. +pub enum RemoteDocument { + /// Successfully fetched and content appears to be markdown. + Loaded { url: String, content: String }, + /// Domain is not in the whitelist. + DomainNotAllowed { domain: String }, + /// HTTP request failed (network error, timeout, non-2xx status). + FetchError { url: String, reason: String }, + /// Response content does not appear to be markdown (e.g. HTML page, binary). + NotMarkdown { url: String, content_type: String }, +} + +/// Extract the domain from a URL string. 
/// Extract the lower-cased host name from an absolute URL string.
///
/// Only the authority component (the part between `://` and the start of the
/// path, query or fragment) is inspected. Credentials (`user:password@`) and a
/// numeric port are removed, so the result is the host an HTTP client would
/// actually connect to.
///
/// Examples:
/// - `https://example.com/foo` → `example.com`
/// - `https://sub.example.com:8080/bar` → `sub.example.com`
/// - `https://user:pw@example.com/` → `example.com`
/// - `https://evil.test?x=.example.com` → `evil.test`
///
/// Returns `None` when the string contains no `://` separator or when the
/// host part is empty.
fn extract_domain(url: &str) -> Option<String> {
    // Split off the scheme: "https://example.com/..." → "example.com/..."
    let (_scheme, after_scheme) = url.split_once("://")?;

    // The authority ends at the first path, query or fragment delimiter.
    // A backslash is included because URL parsers following the WHATWG rules
    // treat it like '/' for http(s) URLs. Stopping only at '/' would let
    // "https://evil.test?.example.com" look like a host ending in
    // ".example.com" and slip through a suffix-based whitelist.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#' | '\\'))
        .next()
        .unwrap_or(after_scheme);

    // Drop optional credentials; the host starts after the LAST '@'.
    let host_port = authority.rsplit('@').next().unwrap_or(authority);

    // Strip the port, but only when everything after the last ':' is numeric
    // (so a bracketed IPv6 literal without a port is left untouched).
    let host = match host_port.rsplit_once(':') {
        Some((host, port)) if port.chars().all(|c| c.is_ascii_digit()) => host,
        _ => host_port,
    };

    if host.is_empty() {
        return None;
    }
    Some(host.to_lowercase())
}

/// Check whether `domain` is permitted by the `allowed_domains` list.
///
/// Matching rules (case-insensitive on both sides):
/// - Exact match: `example.com` matches `example.com`
/// - Subdomain match: `sub.example.com` matches whitelist entry `example.com`
///
/// A whitelist entry is trimmed of surrounding whitespace before comparison.
/// Blank entries are ignored: an empty entry would otherwise turn into the
/// suffix `"."` and admit every host written with a trailing dot.
fn domain_is_allowed(domain: &str, allowed_domains: &[String]) -> bool {
    let domain = domain.to_lowercase();

    allowed_domains.iter().any(|entry| {
        let entry = entry.trim().to_lowercase();
        if entry.is_empty() {
            return false;
        }
        // `domain` must be the entry itself, or end with ".<entry>": whatever
        // precedes the entry has to be empty or end on a label boundary, so
        // "notexample.com" does not match "example.com".
        match domain.strip_suffix(entry.as_str()) {
            Some(prefix) => prefix.is_empty() || prefix.ends_with('.'),
            None => false,
        }
    })
}
+pub fn fetch_remote_markdown(url: &str, allowed_domains: &[String]) -> RemoteDocument { + // Step 1: Domain whitelist check + let domain = match extract_domain(url) { + Some(d) => d, + None => { + return RemoteDocument::FetchError { + url: url.to_string(), + reason: "Could not parse domain from URL".to_string(), + }; + } + }; + + if !domain_is_allowed(&domain, allowed_domains) { + return RemoteDocument::DomainNotAllowed { domain }; + } + + // Step 2: HTTP GET with timeout + let response = match ureq::get(url) + .timeout(Duration::from_secs(10)) + .call() + { + Ok(resp) => resp, + Err(ureq::Error::Status(code, resp)) => { + return RemoteDocument::FetchError { + url: url.to_string(), + reason: format!("HTTP {} {}", code, resp.status_text().to_string()), + }; + } + Err(e) => { + return RemoteDocument::FetchError { + url: url.to_string(), + reason: e.to_string(), + }; + } + }; + + // Step 3: Content-Type validation + let content_type = response + .header("Content-Type") + .unwrap_or("application/octet-stream") + .to_lowercase(); + + // Strip parameters like "; charset=utf-8" + let ct_base = content_type.split(';').next().unwrap_or("").trim().to_string(); + + let url_path_is_md = url.split('?').next().unwrap_or(url).ends_with(".md"); + + let is_acceptable = matches!( + ct_base.as_str(), + "text/markdown" | "text/plain" | "text/x-markdown" + ) || url_path_is_md; + + let is_html = ct_base == "text/html"; + + if is_html || (!is_acceptable && !url_path_is_md) { + return RemoteDocument::NotMarkdown { + url: url.to_string(), + content_type: ct_base, + }; + } + + // Step 4: Read body with 5 MB limit + let mut body = String::new(); + let mut reader = response.into_reader().take(5_000_000); + if let Err(e) = reader.read_to_string(&mut body) { + return RemoteDocument::FetchError { + url: url.to_string(), + reason: format!("Failed to read response body: {}", e), + }; + } + + RemoteDocument::Loaded { + url: url.to_string(), + content: body, + } +} + // ── Directory listing 
───────────────────────────────────────────────────────── /// Entry in the vault directory listing.