From ee186eeba39ba78c3878338ae5e193ed7b3ef18f Mon Sep 17 00:00:00 2001 From: Finn Bear Date: Sat, 19 Oct 2024 00:10:40 -0700 Subject: [PATCH] Add more crates to accuracy comparison. --- Cargo.toml | 6 ++++-- Makefile | 3 +++ README.md | 2 ++ src/censor.rs | 23 +++++++++++++++++++++-- src/lib.rs | 4 ++-- 5 files changed, 32 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 80c7b4e..372e44b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,8 +75,10 @@ serde = {version = "1", features=["derive"], optional = true} [dev-dependencies] rand = "0.8" csv = "1.1" -censor_crate = {package = "censor", version = "0.3.0"} -rustrict_old = {package = "rustrict", version = "0.7.24"} +censor_crate = { package = "censor", version = "0.3.0" } +rustrict_old = { package = "rustrict", version = "0.7.24" } serial_test = "0.5" +stfu_crate = { package = "stfu", version = "0.1.0" } +profane_rs_crate = { package = "profane-rs", version = "0.0.4" } bincode = "1.3.3" serde_json = "1" diff --git a/Makefile b/Makefile index 5f70f36..b1df717 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,9 @@ test: compare: COMPARE=1 make test +table: + cargo test --release -- accuracy --nocapture + # Skips accuracy analysis so finishes faster. test_debug: cargo test --features pii -- --nocapture diff --git a/README.md b/README.md index b9f9fc4..2cc56db 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,8 @@ is used as a dataset. 
Positive accuracy is the percentage of profanity detected |-------|----------|-------------------|-------------------|------| | [rustrict](https://crates.io/crates/rustrict) | 79.83% | 94.00% | 76.30% | 9s | | [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s | +| [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s | +| [profane-rs](https://crates.io/crates/profane-rs) | 80.47% | 73.79% | 82.14% | 52s | ## Development diff --git a/src/censor.rs b/src/censor.rs index 774fe87..1dd305e 100644 --- a/src/censor.rs +++ b/src/censor.rs @@ -1163,6 +1163,18 @@ mod tests { filter.check(s) } + let mut stfu_filter = stfu_crate::types::OwnedFilter::default(); + use stfu_crate::word_lists::severity::{MILD, SEVERE, STRONG}; + stfu_filter.add_slice(&MILD); + stfu_filter.add_slice(&STRONG); + stfu_filter.add_slice(&SEVERE); + + let stfu = |s: &str| -> bool { stfu_filter.filter_string(s).is_some() }; + + fn profane_rs(s: &str) -> bool { + profane_rs_crate::contains_profanity(s, false) + } + println!("| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |"); println!("|-------|----------|-------------------|-------------------|------|"); print_accuracy( @@ -1172,12 +1184,19 @@ mod tests { Some(rustrict_old as fn(&str) -> bool).filter(|_| std::env::var("COMPARE").is_ok()), ); print_accuracy("https://crates.io/crates/censor", censor, false, None); + print_accuracy("https://crates.io/crates/stfu", stfu, false, None); + print_accuracy( + "https://crates.io/crates/profane-rs", + profane_rs, + false, + None, + ); } #[allow(dead_code)] fn print_accuracy( link: &str, - checker: fn(&str) -> bool, + checker: impl Fn(&str) -> bool, find_detections: bool, compare_to: Option<fn(&str) -> bool>, ) { @@ -1196,7 +1215,7 @@ mod tests { #[allow(dead_code)] fn accuracy_of( - checker: fn(&str) -> bool, + checker: impl Fn(&str) -> bool, find_detections: bool, compare_to: Option<fn(&str) -> bool>, ) -> (f32, f32, f32) { diff --git a/src/lib.rs b/src/lib.rs index 
41b994d..7d99400 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -53,13 +53,13 @@ pub(crate) type Set = rustc_hash::FxHashSet; #[allow(deprecated)] pub use censor::add_word; +#[cfg(all(feature = "context", feature = "width"))] +pub use context::ContextWordBreakOptions; #[cfg(feature = "context")] pub use context::{ BlockReason, Context, ContextProcessingOptions, ContextRateLimitOptions, ContextRepetitionLimitOptions, }; -#[cfg(all(feature = "context", feature = "width"))] -pub use context::ContextWordBreakOptions; #[cfg(feature = "pii")] pub use pii::censor_and_analyze_pii;