parsoid 0.10.1

Wrapper around Parsoid HTML that provides convenient accessors for processing and manipulation
Documentation
/*
Copyright (C) 2020-2021 Kunal Mehta <legoktm@debian.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
//! Iterate through the first 500 featured articles (saved as `.html` files
//! under `corpus/`) and run each one through our processing code.

use anyhow::Result;
use parsoid::prelude::*;
use std::fs;

/// Round-trips every `.html` file in `corpus/` through the wikicode accessors.
///
/// For each file (processed in sorted path order) this:
/// 1. parses the HTML and records its serialization,
/// 2. adds and removes a scratch parameter on every template,
/// 3. re-sets the target of every link and external link to its current value,
/// 4. re-sets every category to its current value,
/// 5. serializes again and checks that nothing changed.
///
/// On success a one-line summary (templates, links, nodes) is printed per file.
///
/// # Errors
///
/// Returns an error if `corpus/` cannot be listed, a file cannot be read, the
/// diagnostic dumps cannot be written, or a template operation fails.
///
/// # Panics
///
/// Panics with `No match <title>` when the serialization changed; before that,
/// differing `data-mw` attributes are printed and both serializations are
/// written to `old.txt` / `new.txt`. Also panics if a document has no title, or
/// if the first node of a template is not an element carrying `data-mw`.
fn main() -> Result<()> {
    // Gather the corpus files up front so they are processed in a stable order.
    let mut entries = Vec::new();
    for entry in fs::read_dir("corpus/")? {
        // Propagate directory-iteration errors instead of panicking on them.
        let entry = entry?;
        // Lossy conversion: a file name that is not valid UTF-8 must not abort
        // the whole run, it only has to be checked for the `.html` suffix.
        if entry.file_name().to_string_lossy().ends_with(".html") {
            entries.push(entry.path());
        }
    }
    entries.sort();

    for entry in entries {
        let html = fs::read_to_string(entry)?;
        let code = Wikicode::new(&html);
        let serialized = code.to_string();
        // Iterate through all nodes; counted before any of the edits below.
        let node_count = code.descendants().count();

        let templates = code.filter_templates()?;
        let count = templates.len();
        for template in &templates {
            // Add and immediately remove a scratch parameter.
            let name = "_parsoid-rs testing";
            template.set_param(name, "foo bar baz")?;
            template.remove_param(name)?;
        }

        let links = code.filter_links();
        for link in &links {
            // Run through these codepaths for every link
            link.set_target(&link.target());
        }
        let extlinks = code.filter_external_links();
        for extlink in &extlinks {
            // Run through these codepaths for every link
            extlink.set_target(&extlink.target());
        }

        for category in code.filter_categories() {
            let cat = category.category();
            category.set_category(&cat);
            // TODO: this is dirty on [[1 − 2 + 3 − 4 + ⋯]]
            // let key = category.sort_key();
            // category.set_sort_key(key.as_deref());
        }

        let new_serialized = code.to_string();
        if serialized != new_serialized {
            // Re-parse both serializations and print the `data-mw` payload of
            // every template pair that differs, to help locate the dirty diff.
            let ser_code = Wikicode::new(&serialized);
            let ser_temp = ser_code.filter_templates()?;
            let new_code = Wikicode::new(&new_serialized);
            let new_temp = new_code.filter_templates()?;
            for (ser, new) in ser_temp.iter().zip(new_temp.iter()) {
                let ser_node = ser
                    // relying that node[0] has the data-mw attribute
                    .as_nodes()[0]
                    .as_element()
                    .unwrap()
                    .attributes
                    .borrow()
                    .get("data-mw")
                    .unwrap()
                    .to_string();
                let new_node = new.as_nodes()[0]
                    .as_element()
                    .unwrap()
                    .attributes
                    .borrow()
                    .get("data-mw")
                    .unwrap()
                    .to_string();
                if ser_node != new_node {
                    println!("{ser_node}");
                    println!("---");
                    println!("{new_node}");
                    println!("---");
                }
            }
            // Dump both full serializations for offline diffing before bailing.
            fs::write("old.txt", serialized)?;
            fs::write("new.txt", new_serialized)?;
            panic!("No match {}", &code.title().unwrap());
        }

        println!(
            "{}: {} templates, {} links, {} nodes",
            &code.title().unwrap(),
            count,
            links.len(),
            node_count
        );
    }
    Ok(())
}