Skip to content

Commit

Permalink
Experimental PII detection (0.7.19).
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Dec 10, 2023
1 parent c37227a commit 2a5de33
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 5 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustrict"
authors = ["Finn Bear"]
version = "0.7.18"
version = "0.7.19"
edition = "2021"
license = "MIT OR Apache-2.0"
repository = "https://github.com/finnbear/rustrict/"
Expand Down Expand Up @@ -38,6 +38,7 @@ censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normaliza
context = ["censor", "strsim"]
customize = ["censor"]
width = ["lazy_static"]
pii = ["lazy_static", "regex"]
find_false_positives = ["censor", "regex", "indicatif", "rayon"]
find_replacements = ["csv"]
trace = ["censor"]
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ widths:
cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width

test:
cargo test --release --features width,serde -- --nocapture
cargo test --release --features width,pii,serde -- --nocapture

compare:
COMPARE=1 make test

# Skips accuracy analysis so finishes faster.
test_debug:
cargo test
cargo test --features pii -- --nocapture

fuzz:
cargo fuzz run fuzz
Expand Down
5 changes: 5 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub(crate) mod typ;
#[cfg(feature = "context")]
pub(crate) mod context;

#[cfg(feature = "pii")]
mod pii;
#[cfg(feature = "width")]
pub(crate) mod width;

Expand Down Expand Up @@ -57,6 +59,9 @@ pub use context::{
ContextRepetitionLimitOptions,
};

#[cfg(feature = "pii")]
pub use pii::censor_and_analyze_pii;

/// Trims whitespace characters from both ends of a string, according to the definition of
/// `crate::is_whitespace`.
pub fn trim_whitespace(s: &str) -> &str {
Expand Down
115 changes: 115 additions & 0 deletions src/pii.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
use lazy_static::lazy_static;
use regex::Regex;
use std::borrow::Cow;

lazy_static! {
    // All patterns are compiled once, on first use. The `unwrap` calls can only
    // panic if one of these hard-coded patterns is not a valid regex.

    // Phone number: optional `+` followed by a 1-2 digit country code, an
    // optionally parenthesized 3-digit group, then 3 and 4 digit groups. Groups
    // may be separated by any run of whitespace, dots or dashes (or nothing).
    static ref PHONE: Regex =
        Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap();

    // IPv4-looking address: four dot-separated groups of 1-3 digits. The groups
    // are not range checked, so e.g. `999.999.999.999` also matches.
    static ref IP_ADDRESS: Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap();

    // Email address (case-insensitive), including spelled-out forms: the
    // separator may be `@` or the word "at" delimited by whitespace, brackets
    // or parentheses, and the final dot may be `.` or the word "dot". The
    // top-level domain is limited to 2-3 letters.
    static ref EMAIL_ADDRESS: Regex = Regex::new(
        r#"(?i)[a-z0-9_\-]*\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]*\s*(\.|dot)\s*[a-z]{2,3}"#
    )
    .unwrap();

    // Street address (case-insensitive): a number, a space, one or more words,
    // a street-type suffix (optionally followed by a period), and an optional
    // `#`-prefixed unit number of up to 5 digits.
    static ref ADDRESS: Regex = Regex::new(
        r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ ?)+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#
    )
    .unwrap();

    // Self-described name (case-insensitive): "name is" (optionally preceded by
    // "real" and/or followed by a colon) and then one or two alphabetic words.
    // The second word uses `[a-zA-Z]`; the previous `[a-zA-z]` range also
    // admitted the ASCII punctuation that sits between `Z` and `a`.
    static ref NAME: Regex =
        Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-Z]+)?"#).unwrap();

    // URL (case-insensitive): an optional, leniently written `http`/`https`
    // scheme, an alphanumeric label, a dot, and a 2-3 letter top-level domain.
    static ref URL: Regex =
        Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]+\.[a-zA-Z]{2,3}"#).unwrap();
}

/// Returns [`s`] with personally-identifiable information censored out, and a `true` if
/// anything was censored.
/// - phone numbers
/// - physical addresses
/// - ip addresses
/// - email addresses
/// - self-described full names
/// - urls
pub fn censor_and_analyze_pii(s: &str) -> (String, bool) {
let ret = Cow::Borrowed(s);
let mut censored = false;
let ret = PHONE.replace_all(&ret, "***-****-****");
censored |= matches!(ret, Cow::Owned(_));
let ret = IP_ADDRESS.replace_all(&ret, "***.***.***.***");
censored |= matches!(ret, Cow::Owned(_));
let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***");
censored |= matches!(ret, Cow::Owned(_));
let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***");
censored |= matches!(ret, Cow::Owned(_));
let ret = NAME.replace_all(&ret, "name is ***** *****");
censored |= matches!(ret, Cow::Owned(_));
let ret = URL.replace_all(&ret, "******.***");
censored |= matches!(ret, Cow::Owned(_));
(ret.into_owned(), censored)
}

#[cfg(test)]
mod tests {
    use super::censor_and_analyze_pii;

    /// Returns only the censored text for `s`.
    fn censor_pii(s: &str) -> String {
        censor_and_analyze_pii(s).0
    }

    /// Returns only the "was anything censored" flag for `s`.
    fn has_pii(s: &str) -> bool {
        censor_and_analyze_pii(s).1
    }

    /// Every non-blank line of the fixture must be flagged as containing PII.
    #[test]
    fn pii() {
        // NOTE(review): the e-mail fixtures below are stand-ins; the addresses in
        // the copy of this file under review were mangled into an
        // "email protected" placeholder, which no detector matches.
        let pii = r#"
foo@gmail.com
hello f00 @ gmail.com
bar@yahoo.com sus
foo[at]yahoo.com
foo [at] yahoo dot com
foo at yahoo dot com
foo @ twitch.tv
foo AT twitch.tv
1234567890
(123)4567890
+1 1234567890
+1 (123) 4567890
+12 (123) 456 7890
+1 (123) 456-7890
+1 123-456-7890
+1 123.456.7890
123.123.123.123
8.8.8.8
999.999.999.999
12345 SW 54th ST #150
go to 1234 Main Street for free candy
my name is: ALEX Smith
my real name is Alex smith
his name is alex smith
her real name is alex Smith
my name is alex. smith
hello.com
http://hello.com
https://foo.com
bar.com
example.org
twitch.tv
http:/chat.dev
"#;
        for line in pii.lines() {
            if line.trim().is_empty() {
                continue;
            }
            assert!(has_pii(line), "{line}");
        }
        // Printed for manual inspection of the masks (visible with `--nocapture`).
        println!("{}", censor_pii(pii));
    }

    /// No line of the known-safe corpora may be flagged as PII.
    #[test]
    fn not_pii() {
        for line in include_str!("./safe.txt")
            .lines()
            .chain(include_str!("./false_positives.txt").lines())
        {
            assert!(!has_pii(line), "{line}");
        }
    }

    /// An e-mail address embedded in a sentence is replaced by the fixed mask and
    /// the surrounding text is left untouched.
    #[test]
    fn censor_pii_test() {
        assert_eq!(
            censor_pii("mail me at foo@gmail.com, bye"),
            "mail me at ****@*****.***, bye"
        );
    }
}
2 changes: 1 addition & 1 deletion src/replacements.csv
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ z,sz
Í,í
Î,î
Ï,ï
Ð,ð
Ð,Ddð
Ñ,ñ
Ò,ò
Ó,ó
Expand Down
1 change: 1 addition & 0 deletions src/replacements_extra.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1725,5 +1725,6 @@ B,Bb
ẞ,B
Ŋ,N
º,o
Ð,D

\u{0627},li
5 changes: 4 additions & 1 deletion src/test_positive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2704,4 +2704,7 @@ fuhk you
fyuk yo
Àss
sh¡t
r^pe
r^pe
Ñig
ßich
se×

0 comments on commit 2a5de33

Please sign in to comment.