Skip to main content

subcog/git/
parser.rs

1//! YAML front matter parsing.
2//!
3//! Parses and serializes YAML front matter in memory content.
4//! Front matter format:
5//! ```text
6//! ---
7//! namespace: decisions
8//! domain: org/repo
9//! tags: [rust, architecture]
10//! ---
11//! The actual memory content here.
12//! ```
13//!
14//! # Security
15//!
16//! This module includes protections against YAML-based attacks:
17//! - **Size limits**: Front matter limited to 64KB to prevent memory exhaustion
18//! - **Billion laughs**: Entity expansion limited by `serde_yaml_ng`'s safe defaults
19
20use crate::{Error, Result};
21
/// Upper bound, in bytes, on the YAML front matter the parser accepts (64KB).
///
/// Front matter longer than this is rejected instead of being deserialized,
/// which keeps maliciously large input from exhausting memory.
const MAX_FRONT_MATTER_SIZE: usize = 64 * 1024;
25
/// Parser for YAML front matter in memory content.
///
/// This is a stateless unit type that only serves as a namespace for its
/// associated functions. The common traits are derived so the type can be
/// debug-printed, copied, and default-constructed like any other value.
#[derive(Debug, Clone, Copy, Default)]
pub struct YamlFrontMatterParser;
28
29impl YamlFrontMatterParser {
30    /// The front matter delimiter.
31    const DELIMITER: &'static str = "---";
32
33    /// Parses YAML front matter from content.
34    ///
35    /// Returns the parsed metadata and remaining content.
36    ///
37    /// # Errors
38    ///
39    /// Returns an error if the YAML is malformed.
40    ///
41    /// # Examples
42    ///
43    /// ```rust
44    /// use subcog::git::YamlFrontMatterParser;
45    ///
46    /// let content = "---\nnamespace: decisions\n---\nActual content";
47    /// let (metadata, body) = YamlFrontMatterParser::parse(content).unwrap();
48    /// assert_eq!(metadata["namespace"], "decisions");
49    /// assert_eq!(body, "Actual content");
50    /// ```
51    pub fn parse(content: &str) -> Result<(serde_json::Value, String)> {
52        let content = content.trim_start();
53
54        // Check if content starts with front matter delimiter
55        if !content.starts_with(Self::DELIMITER) {
56            // No front matter, return empty metadata and original content
57            return Ok((
58                serde_json::Value::Object(serde_json::Map::new()),
59                content.to_string(),
60            ));
61        }
62
63        // Find the end of front matter
64        let after_first = &content[Self::DELIMITER.len()..];
65        let after_first = after_first.trim_start_matches(['\r', '\n']);
66
67        if let Some(end_pos) = after_first.find(Self::DELIMITER) {
68            let yaml_content = &after_first[..end_pos].trim();
69
70            // PEN-H3: Prevent billion laughs attack by limiting front matter size
71            if yaml_content.len() > MAX_FRONT_MATTER_SIZE {
72                return Err(Error::InvalidInput(format!(
73                    "YAML front matter exceeds maximum size of {MAX_FRONT_MATTER_SIZE} bytes",
74                )));
75            }
76
77            let body_start = end_pos + Self::DELIMITER.len();
78            let body = after_first[body_start..].trim_start_matches(['\r', '\n']);
79
80            // Parse YAML to serde_json::Value
81            // Note: serde_yaml_ng uses safe-yaml crate which limits entity expansion
82            let metadata: serde_json::Value = serde_yaml_ng::from_str(yaml_content)
83                .map_err(|e| Error::InvalidInput(format!("Invalid YAML front matter: {e}")))?;
84
85            Ok((metadata, body.to_string()))
86        } else {
87            // No closing delimiter found
88            Err(Error::InvalidInput(
89                "Front matter missing closing delimiter".to_string(),
90            ))
91        }
92    }
93
94    /// Serializes metadata to YAML front matter format.
95    ///
96    /// # Errors
97    ///
98    /// Returns an error if serialization fails.
99    ///
100    /// # Examples
101    ///
102    /// ```rust
103    /// use subcog::git::YamlFrontMatterParser;
104    /// use serde_json::json;
105    ///
106    /// let metadata = json!({"namespace": "decisions"});
107    /// let result = YamlFrontMatterParser::serialize(&metadata, "Content here").unwrap();
108    /// assert!(result.contains("---"));
109    /// assert!(result.contains("namespace: decisions"));
110    /// assert!(result.contains("Content here"));
111    /// ```
112    pub fn serialize(metadata: &serde_json::Value, content: &str) -> Result<String> {
113        // If metadata is empty, just return content
114        if metadata.is_null()
115            || (metadata.is_object() && metadata.as_object().is_some_and(serde_json::Map::is_empty))
116        {
117            return Ok(content.to_string());
118        }
119
120        let yaml = serde_yaml_ng::to_string(metadata).map_err(|e| Error::OperationFailed {
121            operation: "serialize_yaml".to_string(),
122            cause: e.to_string(),
123        })?;
124
125        Ok(format!(
126            "{}\n{}{}\n{}",
127            Self::DELIMITER,
128            yaml,
129            Self::DELIMITER,
130            content
131        ))
132    }
133
134    /// Extracts just the body content without parsing metadata.
135    #[must_use]
136    pub fn extract_body(content: &str) -> &str {
137        let content = content.trim_start();
138
139        if !content.starts_with(Self::DELIMITER) {
140            return content;
141        }
142
143        let after_first = &content[Self::DELIMITER.len()..];
144        let after_first = after_first.trim_start_matches(['\r', '\n']);
145
146        after_first
147            .find(Self::DELIMITER)
148            .map_or(content, |end_pos| {
149                let body_start = end_pos + Self::DELIMITER.len();
150                after_first[body_start..].trim_start_matches(['\r', '\n'])
151            })
152    }
153}
154
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn test_parse_with_front_matter() {
        let content = "---\nnamespace: decisions\ntags:\n  - rust\n  - arch\n---\nThe content.";
        let (metadata, body) = YamlFrontMatterParser::parse(content).unwrap();

        assert_eq!(metadata["namespace"], "decisions");
        assert_eq!(metadata["tags"][0], "rust");
        assert_eq!(metadata["tags"][1], "arch");
        assert_eq!(body, "The content.");
    }

    #[test]
    fn test_parse_without_front_matter() {
        let content = "Just plain content";
        let (metadata, body) = YamlFrontMatterParser::parse(content).unwrap();

        assert!(metadata.is_object());
        assert!(metadata.as_object().unwrap().is_empty());
        assert_eq!(body, "Just plain content");
    }

    #[test]
    fn test_parse_skips_leading_whitespace() {
        // Whitespace before the opening delimiter must not hide the front matter.
        let content = "\n  ---\nfoo: bar\n---\nBody";
        let (metadata, body) = YamlFrontMatterParser::parse(content).unwrap();

        assert_eq!(metadata["foo"], "bar");
        assert_eq!(body, "Body");
    }

    #[test]
    fn test_parse_missing_closing_delimiter() {
        let content = "---\nnamespace: test\nNo closing delimiter";
        let result = YamlFrontMatterParser::parse(content);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_rejects_oversized_front_matter() {
        // One byte over the limit in the value alone is enough to trip the guard.
        let oversized = "a".repeat(MAX_FRONT_MATTER_SIZE + 1);
        let content = format!("---\nblob: {oversized}\n---\nbody");

        assert!(YamlFrontMatterParser::parse(&content).is_err());
    }

    #[test]
    fn test_serialize() {
        let metadata = json!({
            "namespace": "learnings",
            "domain": "zircote/subcog"
        });
        let content = "Learning about Rust";
        let result = YamlFrontMatterParser::serialize(&metadata, content).unwrap();

        assert!(result.starts_with("---"));
        assert!(result.contains("namespace: learnings"));
        assert!(result.contains("domain: zircote/subcog"));
        assert!(result.ends_with("Learning about Rust"));
    }

    #[test]
    fn test_serialize_empty_metadata() {
        let metadata = json!({});
        let content = "Just content";
        let result = YamlFrontMatterParser::serialize(&metadata, content).unwrap();
        assert_eq!(result, "Just content");
    }

    #[test]
    fn test_serialize_null_metadata() {
        // Null metadata is treated like an empty object: no front matter is emitted.
        let result =
            YamlFrontMatterParser::serialize(&serde_json::Value::Null, "Just content").unwrap();
        assert_eq!(result, "Just content");
    }

    #[test]
    fn test_extract_body() {
        let content = "---\nfoo: bar\n---\nThe body";
        assert_eq!(YamlFrontMatterParser::extract_body(content), "The body");

        let plain = "No front matter";
        assert_eq!(
            YamlFrontMatterParser::extract_body(plain),
            "No front matter"
        );
    }

    #[test]
    fn test_extract_body_missing_closing_delimiter() {
        // Unterminated front matter falls back to returning the content itself.
        let content = "---\nfoo: bar";
        assert_eq!(YamlFrontMatterParser::extract_body(content), content);
    }

    #[test]
    fn test_roundtrip() {
        let original_meta = json!({
            "namespace": "decisions",
            "tags": ["a", "b"]
        });
        let original_body = "Decision content";

        let serialized = YamlFrontMatterParser::serialize(&original_meta, original_body).unwrap();
        let (parsed_meta, parsed_body) = YamlFrontMatterParser::parse(&serialized).unwrap();

        assert_eq!(parsed_meta["namespace"], original_meta["namespace"]);
        assert_eq!(parsed_meta["tags"], original_meta["tags"]);
        assert_eq!(parsed_body, original_body);
        assert_eq!(
            YamlFrontMatterParser::extract_body(&serialized),
            original_body
        );
    }
}