diff --git a/Cargo.toml b/Cargo.toml index 1e195d1..898f242 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustrict" authors = ["Finn Bear"] -version = "0.7.18" +version = "0.7.19" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/finnbear/rustrict/" @@ -38,6 +38,7 @@ censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normaliza context = ["censor", "strsim"] customize = ["censor"] width = ["lazy_static"] +pii = ["lazy_static", "regex"] find_false_positives = ["censor", "regex", "indicatif", "rayon"] find_replacements = ["csv"] trace = ["censor"] diff --git a/Makefile b/Makefile index 1dd5be0..5f70f36 100644 --- a/Makefile +++ b/Makefile @@ -19,14 +19,14 @@ widths: cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width test: - cargo test --release --features width,serde -- --nocapture + cargo test --release --features width,pii,serde -- --nocapture compare: COMPARE=1 make test # Skips accuracy analysis so finishes faster. test_debug: - cargo test + cargo test --features pii -- --nocapture fuzz: cargo fuzz run fuzz diff --git a/src/lib.rs b/src/lib.rs index 50a6de7..76b898d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,8 @@ pub(crate) mod typ; #[cfg(feature = "context")] pub(crate) mod context; +#[cfg(feature = "pii")] +mod pii; #[cfg(feature = "width")] pub(crate) mod width; @@ -57,6 +59,9 @@ pub use context::{ ContextRepetitionLimitOptions, }; +#[cfg(feature = "pii")] +pub use pii::censor_and_analyze_pii; + /// Trims whitespace characters from both ends of a string, according to the definition of /// `crate::is_whitespace`. pub fn trim_whitespace(s: &str) -> &str { diff --git a/src/pii.rs b/src/pii.rs new file mode 100644 index 0000000..f311e10 --- /dev/null +++ b/src/pii.rs @@ -0,0 +1,115 @@ +use lazy_static::lazy_static; +use regex::Regex; +use std::borrow::Cow; + +lazy_static! 
{
+    static ref PHONE: Regex = Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap();
+    static ref IP_ADDRESS: Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap();
+    static ref EMAIL_ADDRESS: Regex = Regex::new(r#"(?i)[a-z0-9_\-]*\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]*\s*(\.|dot)\s*[a-z]{2,3}"#).unwrap();
+    static ref ADDRESS: Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ ?)+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap();
+    static ref NAME: Regex = Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-Z]+)?"#).unwrap();
+    static ref URL: Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]+\.[a-zA-Z]{2,3}"#).unwrap();
+}
+
+/// Censors personally-identifiable information in `s`.
+///
+/// Returns the censored text together with `true` if anything was censored. Each category is
+/// replaced with a fixed mask, applied in this order:
+/// - phone numbers, then IP addresses, then email addresses
+/// - physical (street) addresses
+/// - self-described names (e.g. "my name is ...")
+/// - URLs
+pub fn censor_and_analyze_pii(s: &str) -> (String, bool) {
+    let ret = Cow::Borrowed(s);
+    let mut censored = false;
+    let ret = PHONE.replace_all(&ret, "***-****-****");
+    censored |= matches!(ret, Cow::Owned(_));
+    let ret = IP_ADDRESS.replace_all(&ret, "***.***.***.***");
+    censored |= matches!(ret, Cow::Owned(_));
+    let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***");
+    censored |= matches!(ret, Cow::Owned(_));
+    let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***");
+    censored |= matches!(ret, Cow::Owned(_));
+    let ret = NAME.replace_all(&ret, "name is ***** *****");
+    censored |= matches!(ret, Cow::Owned(_));
+    let ret = URL.replace_all(&ret, "******.***");
+    censored |= matches!(ret, Cow::Owned(_));
+    (ret.into_owned(), censored)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::censor_and_analyze_pii;
+
+    fn censor_pii(s: &str) -> String {
+        censor_and_analyze_pii(s).0
+    }
+
+    fn has_pii(s: &str) -> bool {
+        censor_and_analyze_pii(s).1
+    }
+
+    #[test]
+    fn pii() {
+        let pii = r#"
hello@gmail.com + hello f00 @ gmail.com + sus@yahoo.biz sus + foo[at]yahoo.com + foo [at] yahoo dot com + foo at yahoo dot com + foo @ twitch.tv + foo AT twitch.tv + 1234567890 + (123)4567890 + +1 1234567890 + +1 (123) 4567890 + +12 (123) 456 7890 + +1 (123) 456-7890 + +1 123-456-7890 + +1 123.456.7890 + 123.123.123.123 + 8.8.8.8 + 999.999.999.999 + 12345 SW 54th ST #150 + go to 1234 Main Street for free candy + my name is: ALEX Smith + my real name is Alex smith + his name is alex smith + her real name is alex Smith + my name is alex. smith + hello.com + http://hello.com + https://foo.com + bar.com + example.org + twitch.tv + http:/chat.dev + "#; + for line in pii.lines() { + if line.trim().is_empty() { + continue; + } + assert!(has_pii(line), "{line}"); + } + println!("{}", censor_pii(pii)); + } + + #[test] + fn not_pii() { + for line in include_str!("./safe.txt") + .lines() + .chain(include_str!("./false_positives.txt").lines()) + { + assert!(!has_pii(line), "{line}"); + } + } + + #[test] + fn censor_pii_test() { + assert_eq!( + censor_pii("mail me at foo@bar.com, bye"), + "mail me at ****@*****.***, bye" + ); + } +} diff --git a/src/replacements.csv b/src/replacements.csv index e8e64f6..546f800 100644 --- a/src/replacements.csv +++ b/src/replacements.csv @@ -108,7 +108,7 @@ z,sz Í,í Î,î Ï,ï -Ð,ð +Ð,Ddð Ñ,ñ Ò,ò Ó,ó diff --git a/src/replacements_extra.csv b/src/replacements_extra.csv index 2c60309..97420f4 100644 --- a/src/replacements_extra.csv +++ b/src/replacements_extra.csv @@ -1725,5 +1725,6 @@ B,Bb ẞ,B Ŋ,N º,o +Ð,D \u{0627},li \ No newline at end of file diff --git a/src/test_positive.txt b/src/test_positive.txt index d5458da..11453e2 100644 --- a/src/test_positive.txt +++ b/src/test_positive.txt @@ -2704,4 +2704,7 @@ fuhk you fyuk yo Àss sh¡t -r^pe \ No newline at end of file +r^pe +Ñig +ßich +se× \ No newline at end of file