use crate::error::OsfError;
use crate::model::*;
use quick_xml::events::Event;
use quick_xml::reader::Reader;
use std::io::Read;
/// Parses raw OSF file bytes into an [`OsfDocument`].
///
/// The bytes are first treated as a zip container; when an XML entry can be extracted from it,
/// that entry is parsed and its result (success or failure) is returned as-is. Otherwise the
/// bytes are decoded lossily as UTF-8 and parsed as bare XML, but only if they contain the
/// marker text `Open Screenplay Format`.
///
/// # Errors
///
/// Returns [`OsfError::NotOsf`] when the input is neither a usable archive nor marked XML, and
/// forwards whatever error the XML parsing step reports.
pub fn parse(data: &[u8]) -> Result<OsfDocument, OsfError> {
    match extract_xml_from_zip(data) {
        Ok(zipped_xml) => parse_xml(&zipped_xml),
        Err(_) => {
            // Not a readable archive: fall back to interpreting the bytes as plain XML.
            let plain = String::from_utf8_lossy(data);
            if plain.contains("Open Screenplay Format") {
                parse_xml(&plain)
            } else {
                Err(OsfError::NotOsf)
            }
        }
    }
}
/// Pulls the screenplay XML out of a zipped OSF container.
///
/// An entry whose name ends with `document.xml` is preferred. If the archive has no such entry,
/// the first entry (in archive index order) whose name ends with `.xml` is used instead.
/// Previously the first `.xml` entry always won, even when `document.xml` appeared later in
/// the archive, because the `document.xml` test was subsumed by the `.xml` test.
///
/// # Errors
///
/// Propagates (via `?`) errors from opening the archive, opening an entry, or reading the
/// chosen entry into a `String`, and returns [`OsfError::MissingDocument`] when no entry name
/// ends with `.xml`.
fn extract_xml_from_zip(data: &[u8]) -> Result<String, OsfError> {
    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data))?;

    // First pass: only look at entry names to decide which entry to read.
    let mut preferred = None;
    let mut fallback = None;
    for index in 0..archive.len() {
        let entry = match archive.by_index(index) {
            Ok(entry) => entry,
            // Once a usable candidate exists, an unreadable later entry must not fail the
            // whole extraction (the old code never looked past the first `.xml` entry).
            Err(_) if fallback.is_some() => continue,
            Err(err) => return Err(err.into()),
        };
        let name = entry.name();
        if name.ends_with("document.xml") {
            preferred = Some(index);
            break;
        }
        if fallback.is_none() && name.ends_with(".xml") {
            fallback = Some(index);
        }
    }

    // Second pass: read the chosen entry.
    let chosen = preferred.or(fallback).ok_or(OsfError::MissingDocument)?;
    let mut file = archive.by_index(chosen)?;
    let mut contents = String::new();
    file.read_to_string(&mut contents)?;
    Ok(contents)
}
/// Parses the XML text of an OSF document into an [`OsfDocument`].
///
/// The XML is read as a flat event stream. While streaming, the function tracks which container
/// (`<paragraphs>`, `<titlepage>`, `<lists>`) and which `<para>` / `<text>` it is inside, and
/// collects:
///
/// * the `version` attribute of `<document>`;
/// * metadata attributes of `<info>` (`uuid`, `pagecount`/`pageCount`, `title`, `written_by`,
///   `contact`, `drafts`, `copyright`);
/// * one [`Paragraph`] per `</para>`, routed to the title page or the script body depending on
///   the enclosing container (paragraphs outside both are dropped);
/// * `name` attributes of `<character>` / `<location>` elements inside `<lists>`.
///
/// Several attribute spellings are accepted for the same datum (for example `page_number` and
/// `pageNumber`, or `basestyle`, `baseStyleName` and `basestylename`).
///
/// Title-page fields that the `<titlepage>` paragraphs did not supply are filled from the
/// `<info>` attributes, skipping the placeholder values `Untitled` and `Written by`.
///
/// # Errors
///
/// Returns [`OsfError::Xml`] when the XML reader reports an error, and [`OsfError::NotOsf`] when
/// no non-zero version was read and no body paragraphs were found.
fn parse_xml(xml: &str) -> Result<OsfDocument, OsfError> {
    let mut reader = Reader::from_str(xml);

    // Document-level data.
    let mut raw_version = 0u32;
    let mut uuid = None;
    let mut page_count = None;
    let mut characters = Vec::new();
    let mut locations = Vec::new();
    let mut info_title: Option<String> = None;
    let mut info_written_by: Option<String> = None;
    let mut info_contact: Option<String> = None;
    let mut info_drafts: Option<String> = None;
    let mut info_copyright: Option<String> = None;
    let mut paragraphs = Vec::new();
    let mut titlepage_paras = Vec::new();

    // Position within the element tree.
    let mut in_paragraphs = false;
    let mut in_titlepage = false;
    let mut in_lists = false;
    let mut in_text = false;

    // State of the `<para>` currently being read.
    let mut current_style = String::new();
    let mut current_text = String::new();
    let mut current_page: Option<u32> = None;
    let mut current_scene_number: Option<String> = None;
    let mut current_bookmark: Option<String> = None;
    let mut current_dual_dialogue = false;

    loop {
        let event = match reader.read_event() {
            Ok(event) => event,
            Err(e) => return Err(OsfError::Xml(format!("{e}"))),
        };
        match &event {
            Event::Start(e) | Event::Empty(e) => {
                // A self-closing element (`<lists/>`, `<text/>`, ...) never produces an `End`
                // event, so it must not switch on a flag that only an end tag switches off.
                let has_end_tag = matches!(&event, Event::Start(_));
                let tag = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                match tag.as_str() {
                    "document" => {
                        for attr in e.attributes().flatten() {
                            if attr_key(&attr) == "version" {
                                // An unparseable version is treated like a missing one.
                                raw_version = attr_val(&attr).parse().unwrap_or(0);
                            }
                        }
                    }
                    "info" => {
                        for attr in e.attributes().flatten() {
                            let key = attr_key(&attr);
                            let val = attr_val(&attr);
                            match key.as_str() {
                                "uuid" => uuid = Some(val),
                                "pagecount" | "pageCount" => {
                                    page_count = val.parse().ok();
                                }
                                "title" => info_title = Some(val),
                                "written_by" => info_written_by = Some(val),
                                "contact" => {
                                    if !val.is_empty() {
                                        info_contact = Some(val);
                                    }
                                }
                                "drafts" => info_drafts = Some(val),
                                "copyright" => info_copyright = Some(val),
                                _ => {}
                            }
                        }
                    }
                    "paragraphs" if has_end_tag => in_paragraphs = true,
                    "titlepage" if has_end_tag => in_titlepage = true,
                    "lists" if has_end_tag => in_lists = true,
                    "para" => {
                        // Start every paragraph from a clean slate.
                        current_style.clear();
                        current_text.clear();
                        current_page = None;
                        current_scene_number = None;
                        current_bookmark = None;
                        current_dual_dialogue = false;
                        for attr in e.attributes().flatten() {
                            let key = attr_key(&attr);
                            let val = attr_val(&attr);
                            match key.as_str() {
                                "page_number" | "pageNumber" => {
                                    current_page = val.parse().ok();
                                }
                                "number" | "sceneNumber" => {
                                    current_scene_number = Some(val);
                                }
                                "bookmark" => {
                                    current_bookmark = Some(val);
                                }
                                _ => {}
                            }
                        }
                    }
                    "style" => {
                        for attr in e.attributes().flatten() {
                            let key = attr_key(&attr);
                            if key == "basestyle"
                                || key == "baseStyleName"
                                || key == "basestylename"
                            {
                                current_style = attr_val(&attr);
                            }
                            if key == "dualdialogue" && attr_val(&attr) == "1" {
                                current_dual_dialogue = true;
                            }
                        }
                    }
                    "text" if has_end_tag => in_text = true,
                    "character" if in_lists => {
                        for attr in e.attributes().flatten() {
                            if attr_key(&attr) == "name" {
                                characters.push(attr_val(&attr));
                            }
                        }
                    }
                    "location" if in_lists => {
                        for attr in e.attributes().flatten() {
                            if attr_key(&attr) == "name" {
                                locations.push(attr_val(&attr));
                            }
                        }
                    }
                    _ => {}
                }
            }
            Event::Text(e) => {
                // Only text inside `<text>` belongs to the paragraph; consecutive runs are
                // concatenated. Text that fails to unescape contributes nothing.
                if in_text {
                    let run = e.unescape().unwrap_or_default();
                    current_text.push_str(&run);
                }
            }
            Event::End(e) => {
                let tag = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                match tag.as_str() {
                    "text" => in_text = false,
                    "para" => {
                        // Move the accumulated state into the paragraph; the next `<para>`
                        // resets all of it anyway, so nothing needs to be cloned.
                        let para = Paragraph {
                            style: ParaStyle::from_name(&current_style),
                            text: std::mem::take(&mut current_text),
                            page: current_page,
                            scene_number: current_scene_number.take(),
                            bookmark: current_bookmark.take(),
                            dual_dialogue: current_dual_dialogue,
                        };
                        if in_titlepage {
                            titlepage_paras.push(para);
                        } else if in_paragraphs {
                            paragraphs.push(para);
                        }
                    }
                    "paragraphs" => in_paragraphs = false,
                    "titlepage" => in_titlepage = false,
                    "lists" => in_lists = false,
                    _ => {}
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }

    // Neither a version nor any body paragraph: this is not an OSF document.
    if raw_version == 0 && paragraphs.is_empty() {
        return Err(OsfError::NotOsf);
    }

    let version = OsfVersion::from_raw(raw_version);

    // Title-page paragraphs take precedence; `<info>` attributes only fill the gaps.
    let mut title_page = build_title_page(&titlepage_paras);
    if title_page.title.is_none() {
        title_page.title = info_title.filter(|t| t != "Untitled");
    }
    if title_page.authors.is_empty() {
        if let Some(w) = info_written_by {
            if w != "Written by" && !w.is_empty() {
                title_page.authors.push(w);
            }
        }
    }
    if title_page.draft.is_none() {
        title_page.draft = info_drafts;
    }
    if title_page.contact.is_none() {
        title_page.contact = info_contact;
    }
    if title_page.copyright.is_none() {
        title_page.copyright = info_copyright;
    }

    let (scenes, raw_text) = build_scenes(&paragraphs);
    Ok(OsfDocument {
        version,
        uuid,
        page_count,
        title_page,
        scenes,
        characters,
        locations,
        raw_text,
    })
}
/// Assembles a [`TitlePage`] from the paragraphs found in the `<titlepage>` section.
///
/// Paragraphs whose text is blank after trimming are skipped. For the rest, the `bookmark`
/// value selects the field: `Title`, `Draft`, `Contact` and `Copyright` overwrite their field,
/// while each `Author` paragraph appends to the author list. A paragraph with any other (or no)
/// bookmark supplies the title only if none is set yet and its style equals
/// `ParaStyle::Other("Title")`.
fn build_title_page(paras: &[Paragraph]) -> TitlePage {
    let mut page = TitlePage::default();

    let non_blank = paras
        .iter()
        .map(|para| (para, para.text.trim()))
        .filter(|(_, trimmed)| !trimmed.is_empty());

    for (para, trimmed) in non_blank {
        match para.bookmark.as_deref() {
            Some("Title") => page.title = Some(String::from(trimmed)),
            Some("Author") => page.authors.push(String::from(trimmed)),
            Some("Draft") => page.draft = Some(String::from(trimmed)),
            Some("Contact") => page.contact = Some(String::from(trimmed)),
            Some("Copyright") => page.copyright = Some(String::from(trimmed)),
            _ => {
                // Unbookmarked paragraph: usable as the title only via its style name.
                if page.title.is_some() {
                    continue;
                }
                if para.style == ParaStyle::Other("Title".to_string()) {
                    page.title = Some(String::from(trimmed));
                }
            }
        }
    }

    page
}
/// Groups body paragraphs into scenes and renders the whole script as plain text.
///
/// Every paragraph is formatted with `format_paragraph` and appended to the returned text. A
/// paragraph styled as a scene heading closes the scene in progress (if any) and opens a new
/// one; other paragraphs are added to the body of the open scene, or appear only in the plain
/// text when no scene has been opened yet.
///
/// A scene's number is its own scene-number attribute when that parses as `usize`; otherwise it
/// is the scene's 1-based position in the output list.
fn build_scenes(paras: &[Paragraph]) -> (Vec<Scene>, String) {
    /// A scene whose body is still being collected.
    struct OpenScene {
        heading: String,
        body: String,
        page: Option<u32>,
        label: Option<String>,
    }

    /// Finalizes `scene` and appends it to `done`.
    fn seal(done: &mut Vec<Scene>, scene: OpenScene) {
        let ordinal = done.len() + 1;
        let number = match scene.label.as_deref().map(str::parse::<usize>) {
            Some(Ok(explicit)) => explicit,
            _ => ordinal,
        };
        done.push(Scene {
            number,
            heading: scene.heading,
            page: scene.page,
            body: scene.body.trim().to_string(),
        });
    }

    let mut done: Vec<Scene> = Vec::new();
    let mut transcript = String::new();
    let mut open: Option<OpenScene> = None;

    for para in paras {
        let rendered = format_paragraph(para);
        transcript.push_str(&rendered);

        if para.style == ParaStyle::SceneHeading {
            if let Some(previous) = open.take() {
                seal(&mut done, previous);
            }
            open = Some(OpenScene {
                heading: para.text.clone(),
                body: String::new(),
                page: para.page,
                label: para.scene_number.clone(),
            });
        } else if let Some(scene) = open.as_mut() {
            scene.body.push_str(&rendered);
        }
    }

    // The last scene has no following heading to close it.
    if let Some(last) = open {
        seal(&mut done, last);
    }

    (done, transcript)
}
/// Renders one paragraph as a line of plain text.
///
/// A paragraph with empty text becomes a lone newline regardless of style. Otherwise the layout
/// depends on the style: scene headings, character cues and transitions are preceded by a blank
/// line; character cues, parentheticals and dialogue are indented; everything else is the text
/// followed by a newline.
fn format_paragraph(para: &Paragraph) -> String {
    let text = para.text.as_str();
    match &para.style {
        _ if text.is_empty() => "\n".to_string(),
        ParaStyle::SceneHeading => format!("\n{}\n", text),
        ParaStyle::Character => format!("\n {}\n", text),
        ParaStyle::Parenthetical => format!(" {}\n", text),
        ParaStyle::Dialogue => format!(" {}\n", text),
        ParaStyle::Transition => format!("\n{}\n", text),
        _ => format!("{}\n", text),
    }
}
/// Returns the attribute's name as an owned string, replacing invalid UTF-8 lossily.
fn attr_key(attr: &quick_xml::events::attributes::Attribute) -> String {
    let raw_name: &[u8] = attr.key.as_ref();
    String::from_utf8_lossy(raw_name).into_owned()
}
/// Returns the attribute's value as an owned string with XML escapes resolved.
///
/// Entity references such as `&amp;` or `&quot;` are replaced by the characters they stand
/// for, matching how text nodes are unescaped elsewhere in this file. If the value cannot be
/// unescaped (for example because of an unknown entity or invalid UTF-8), the raw bytes are
/// returned instead, decoded lossily, so a malformed attribute never aborts parsing.
fn attr_val(attr: &quick_xml::events::attributes::Attribute) -> String {
    match attr.unescape_value() {
        Ok(value) => value.into_owned(),
        Err(_) => String::from_utf8_lossy(&attr.value).into_owned(),
    }
}
// Unit tests for `parse_xml`, driven by inline XML fixtures. Each fixture is a raw string, so
// its bytes (including line breaks) are exactly what the parser receives.
#[cfg(test)]
mod tests {
use super::*;
// A version="40" document using the `page_number` / `basestyle` attribute spellings: checks
// the version, uuid, bookmarked title-page fields, the single scene and the character and
// location lists.
#[test]
fn test_v4_parse() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<document type="Open Screenplay Format document" version="40">
<info uuid="test-uuid" pagecount="1"/>
<settings/><styles/>
<paragraphs>
<para page_number="1">
<style basestyle="Scene Heading"/>
<text>INT. COFFEE SHOP - DAY</text>
</para>
<para page_number="1">
<style basestyle="Action"/>
<text>A busy morning.</text>
</para>
<para page_number="1">
<style basestyle="Character"/>
<text>JOHN</text>
</para>
<para page_number="1">
<style basestyle="Dialogue"/>
<text>I need coffee.</text>
</para>
</paragraphs>
<titlepage>
<para bookmark="Title"><style basestyle="Action"/><text>My Screenplay</text></para>
<para bookmark="Author"><style basestyle="Action"/><text>Jane Doe</text></para>
</titlepage>
<lists>
<characters><character name="JOHN"/></characters>
<locations><location name="Coffee Shop"/></locations>
</lists>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert_eq!(doc.version, OsfVersion::V4);
assert_eq!(doc.uuid.as_deref(), Some("test-uuid"));
assert_eq!(doc.title_page.title.as_deref(), Some("My Screenplay"));
assert_eq!(doc.title_page.authors, vec!["Jane Doe"]);
assert_eq!(doc.scenes.len(), 1);
assert_eq!(doc.scenes[0].heading, "INT. COFFEE SHOP - DAY");
assert!(doc.scenes[0].body.contains("JOHN"));
assert!(doc.scenes[0].body.contains("I need coffee."));
assert_eq!(doc.characters, vec!["JOHN"]);
assert_eq!(doc.locations, vec!["Coffee Shop"]);
}
// A version="21" document using the camelCase spellings (`pageCount`, `pageNumber`,
// `sceneNumber`, `baseStyleName`): checks that they are recognised and that the explicit
// scene number is used.
#[test]
fn test_v2_attributes() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<document type="Open Screenplay Format document" version="21">
<info uuid="v2-test" pageCount="2"/>
<styles/>
<paragraphs>
<para pageNumber="1" sceneNumber="1">
<style baseStyleName="Scene Heading"/>
<text>EXT. PARK - NIGHT</text>
</para>
<para pageNumber="1">
<style baseStyleName="Action"/>
<text>Moonlight.</text>
</para>
</paragraphs>
<titlepage/><lists/>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert_eq!(doc.version, OsfVersion::V2);
assert_eq!(doc.scenes.len(), 1);
assert_eq!(doc.scenes[0].heading, "EXT. PARK - NIGHT");
assert_eq!(doc.scenes[0].number, 1);
}
// A version="12" document with no <titlepage>: the title-page fields must be filled from the
// <info> attributes, and the paragraphs must split into two scenes at the second heading.
#[test]
fn test_v1_metadata_in_info() {
let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<document type="Open Screenplay Format document" version="12">
<info title="The Great Script" written_by="John Smith" copyright="Copyright 2025"
contact="john@example.com" drafts="Second Draft" uuid="v1-test" pagecount="50"/>
<settings/><styles/>
<paragraphs>
<para>
<style basestylename="Scene Heading"/>
<text>Int. First Location - day</text>
</para>
<para>
<style basestylename="Action"/>
<text>Action description.</text>
</para>
<para>
<style basestylename="Character"/>
<text>CHARACTER 1</text>
</para>
<para>
<style basestylename="Dialogue"/>
<text>Hello there.</text>
</para>
<para>
<style basestylename="Transition"/>
<text>CUT TO:</text>
</para>
<para>
<style basestylename="Scene Heading"/>
<text>Ext. Second location - night</text>
</para>
<para>
<style basestylename="Action"/>
<text>More action.</text>
</para>
</paragraphs>
<spelling language="en_US"/>
<lists/>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert_eq!(doc.version, OsfVersion::V1);
assert_eq!(doc.title_page.title.as_deref(), Some("The Great Script"));
assert_eq!(doc.title_page.authors, vec!["John Smith"]);
assert_eq!(doc.title_page.draft.as_deref(), Some("Second Draft"));
assert_eq!(doc.title_page.contact.as_deref(), Some("john@example.com"));
assert_eq!(
doc.title_page.copyright.as_deref(),
Some("Copyright 2025")
);
assert_eq!(doc.scenes.len(), 2);
assert_eq!(doc.scenes[0].heading, "Int. First Location - day");
assert_eq!(doc.scenes[1].heading, "Ext. Second location - night");
assert!(doc.scenes[0].body.contains("CHARACTER 1"));
assert!(doc.scenes[0].body.contains("CUT TO:"));
}
// Several <text> runs inside one <para> must be joined into a single paragraph text, with
// the whitespace between the elements left out.
#[test]
fn test_multi_text_runs_concatenated() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<document type="Open Screenplay Format document" version="40">
<info/><settings/><styles/>
<paragraphs>
<para page_number="1">
<style basestyle="Scene Heading"/>
<text>INT. OFFICE - DAY</text>
</para>
<para page_number="1">
<style basestyle="Action"/>
<text bold="1">Sarah</text>
<text> walks to the </text>
<text italic="1">window</text>
<text>.</text>
</para>
</paragraphs>
<titlepage/><lists/>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert!(doc.scenes[0].body.contains("Sarah walks to the window."));
}
// The placeholder <info> values "Untitled" and "Written by" must not end up as the title or
// as an author.
#[test]
fn test_v1_skips_default_title() {
let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<document type="Open Screenplay Format document" version="12">
<info title="Untitled" written_by="Written by" contact="" drafts="" uuid="x" pagecount="1"/>
<settings/><styles/>
<paragraphs>
<para><style basestylename="Scene Heading"/><text>INT. ROOM - DAY</text></para>
</paragraphs>
<spelling/><lists/>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert!(doc.title_page.title.is_none());
assert!(doc.title_page.authors.is_empty());
}
// Scenes without an explicit scene number must be numbered 1, 2, 3 in document order.
#[test]
fn test_multiple_scenes_sequential_numbering() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<document type="Open Screenplay Format document" version="40">
<info/><settings/><styles/>
<paragraphs>
<para><style basestyle="Scene Heading"/><text>INT. KITCHEN - DAY</text></para>
<para><style basestyle="Action"/><text>Coffee brews.</text></para>
<para><style basestyle="Scene Heading"/><text>EXT. GARDEN - DAY</text></para>
<para><style basestyle="Action"/><text>Birds sing.</text></para>
<para><style basestyle="Scene Heading"/><text>INT. BEDROOM - NIGHT</text></para>
<para><style basestyle="Action"/><text>Silence.</text></para>
</paragraphs>
<titlepage/><lists/>
</document>"#;
let doc = parse_xml(xml).unwrap();
assert_eq!(doc.scenes.len(), 3);
assert_eq!(doc.scenes[0].number, 1);
assert_eq!(doc.scenes[1].number, 2);
assert_eq!(doc.scenes[2].number, 3);
assert_eq!(doc.scenes[0].heading, "INT. KITCHEN - DAY");
assert_eq!(doc.scenes[2].heading, "INT. BEDROOM - NIGHT");
}
// Well-formed XML with no version attribute and no body paragraphs must be rejected.
#[test]
fn test_not_osf_returns_error() {
let result = parse_xml("<html><body>Not a screenplay</body></html>");
assert!(result.is_err());
}
}