-
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Experimental PII detection (0.7.19).
- Loading branch information
Showing
7 changed files
with
130 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
use lazy_static::lazy_static; | ||
use regex::Regex; | ||
use std::borrow::Cow; | ||
|
||
lazy_static! { | ||
static ref PHONE : Regex = Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap(); | ||
static ref IP_ADDRESS : Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap(); | ||
static ref EMAIL_ADDRESS : Regex = Regex::new(r#"(?i)[a-z0-9_\-]*\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]*\s*(\.|dot)\s*[a-z]{2,3}"#).unwrap(); | ||
static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ ?)+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap(); | ||
static ref NAME : Regex = Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-z]+)?"#).unwrap(); | ||
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]+\.[a-zA-Z]{2,3}"#).unwrap(); | ||
} | ||
|
||
/// Returns [`s`] with personally-identifiable information censored out, and a `true` if | ||
/// anything was censored. | ||
/// - phone numbers | ||
/// - physical addresses | ||
/// - ip addresses | ||
/// - email addresses | ||
/// - self-described full names | ||
/// - urls | ||
pub fn censor_and_analyze_pii(s: &str) -> (String, bool) { | ||
let ret = Cow::Borrowed(s); | ||
let mut censored = false; | ||
let ret = PHONE.replace_all(&ret, "***-****-****"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = IP_ADDRESS.replace_all(&ret, "***.***.***.***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = NAME.replace_all(&ret, "name is ***** *****"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = URL.replace_all(&ret, "******.***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
(ret.into_owned(), censored) | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::censor_and_analyze_pii;

    /// Returns only the censored text for `s`.
    fn censor_pii(s: &str) -> String {
        censor_and_analyze_pii(s).0
    }

    /// Returns only whether anything in `s` was censored.
    fn has_pii(s: &str) -> bool {
        censor_and_analyze_pii(s).1
    }

    /// Every non-blank line of the sample must be flagged as containing PII.
    #[test]
    fn pii() {
        // The sample covers, in order: email addresses (plain and spelled-out), phone numbers,
        // IP-shaped text, street addresses, self-described names, and URLs / bare domains.
        let pii = r#"
foo@gmail.com
hello f00 @ gmail.com
foo@gmail.com sus
foo[at]yahoo.com
foo [at] yahoo dot com
foo at yahoo dot com
foo @ twitch.tv
foo AT twitch.tv
1234567890
(123)4567890
+1 1234567890
+1 (123) 4567890
+12 (123) 456 7890
+1 (123) 456-7890
+1 123-456-7890
+1 123.456.7890
123.123.123.123
8.8.8.8
999.999.999.999
12345 SW 54th ST #150
go to 1234 Main Street for free candy
my name is: ALEX Smith
my real name is Alex smith
his name is alex smith
her real name is alex Smith
my name is alex. smith
hello.com
http://hello.com
https://foo.com
bar.com
example.org
twitch.tv
http:/chat.dev
"#;
        for line in pii.lines() {
            if line.trim().is_empty() {
                continue;
            }
            assert!(has_pii(line), "{line}");
        }
        // Printed for manual inspection of the censored output (visible with `--nocapture`).
        println!("{}", censor_pii(pii));
    }

    /// No line of the known-safe and known-false-positive fixtures may be flagged.
    #[test]
    fn not_pii() {
        for line in include_str!("./safe.txt")
            .lines()
            .chain(include_str!("./false_positives.txt").lines())
        {
            assert!(!has_pii(line), "{line}");
        }
    }

    /// An email address embedded in a sentence is masked and the surrounding text is kept.
    #[test]
    fn censor_pii_test() {
        assert_eq!(
            censor_pii("mail me at foo@bar.com, bye"),
            "mail me at ****@*****.***, bye"
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,7 +108,7 @@ z,sz | |
Í,í | ||
Î,î | ||
Ï,ï | ||
Ð,ð | ||
Ð,Ddð | ||
Ñ,ñ | ||
Ò,ò | ||
Ó,ó | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1725,5 +1725,6 @@ B,Bb | |
ẞ,B | ||
Ŋ,N | ||
º,o | ||
Ð,D | ||
|
||
\u{0627},li |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2704,4 +2704,7 @@ fuhk you | |
fyuk yo | ||
Àss | ||
sh¡t | ||
r^pe | ||
r^pe | ||
Ñig | ||
ßich | ||
se× |