From 61b6cd721fa6e63765926cc0302802f32f1ea902 Mon Sep 17 00:00:00 2001 From: Finn Bear Date: Fri, 15 Mar 2024 13:44:31 -0700 Subject: [PATCH] Fix #23 - improve wordlists and release 0.7.23 --- Cargo.toml | 2 +- README.md | 2 +- src/false_positives.txt | 2 + src/profanity.csv | 5 + src/replacements.csv | 376 +++++++++++++++++++++++++++++++++++-- src/replacements_extra.csv | 373 +++++++++++++++++++++++++++++++++++- 6 files changed, 741 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1311350..b8ef889 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustrict" authors = ["Finn Bear"] -version = "0.7.22" +version = "0.7.23" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/finnbear/rustrict/" diff --git a/README.md b/README.md index 5954874..7b157da 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected | Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time | |-------|----------|-------------------|-------------------|------| -| [rustrict](https://crates.io/crates/rustrict) | 79.81% | 93.99% | 76.27% | 8s | +| [rustrict](https://crates.io/crates/rustrict) | 79.81% | 93.99% | 76.27% | 9s | | [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s | ## Development diff --git a/src/false_positives.txt b/src/false_positives.txt index b75ec80..4b2f488 100644 --- a/src/false_positives.txt +++ b/src/false_positives.txt @@ -17641,6 +17641,8 @@ terrapene terrible peer terror ist terror list +terror organizational +terror organizations terroristic test esc test esp diff --git a/src/profanity.csv b/src/profanity.csv index dc40297..4089a8b 100644 --- a/src/profanity.csv +++ b/src/profanity.csv @@ -6,6 +6,7 @@ word,profane,offensive,sexual,mean,evasive ࿗,0,2,0,0,0 ࿘,0,2,0,0,0 ✠,0,1,0,0,0 +≬,0,0,2,0,1 🖕,1,0,0,0,1 🍆,0,0,1,0,1 🍆💦,1,0,3,0,1 @@ -854,6 +855,8 @@ idolfhatler,0,2,0,0,2 ifkdurmum,2,2,3,2,2 igger,3,3,0,0,0 i have ED,0,0,1,0,1 +i hope your father dies,0,1,0,2,1 +i hope your mother dies,0,1,0,2,1 ima negro,0,1,0,0,1 imbecil,0,0,0,2,0 impregnate,0,0,1,0,0 @@ -1676,6 +1679,7 @@ teabaging,0,0,1,0,0 teats,0,0,1,0,0 teets,0,0,1,0,0 terrorist,0,1,0,0,0 +terror organization,0,1,0,0,0 testes,0,0,2,0,1 testical,0,0,3,0,1 testicle,0,0,3,0,0 @@ -1810,6 +1814,7 @@ weiner,0,0,1,0,0 welcher,0,0,0,1,0 wench,0,0,1,0,0 weirdo,0,0,0,1,0 +wermacht,0,1,0,0,0 wetback,0,1,0,0,0 wetdream,0,0,1,0,0 whank,0,0,3,0,1 diff --git a/src/replacements.csv b/src/replacements.csv index c59a1df..b995099 100644 --- a/src/replacements.csv +++ b/src/replacements.csv @@ -2332,31 +2332,375 @@ z,sz ⅽ,cⅽ ⅾ,dⅾ Ↄ,ↄ -∀,a -∃,ab -∆,a -∈,e +∀,AVav +∁,Cc +∂,e +∃,Eabe +∅,o +∆,Aa +∈,CEce +∊,Ece +∋,Ee +∍,Ee +∎,o +∏,Nn +∐,Uu +∑,Ee +∔,t +∕,il +∖,il +∗,o +∘,o √,v +∛,v +∜,v +∝,co +∞,o +∟,Ll +∠,c +∡,Cc +∢,Cc ∣,il∣ ∥,il -∨,v∨ +∧,n +∨,uv∨ ∩,n -∪,u∪ -∫,l +∪,uv∪ +∫,fil +∬,fil +∭,fil +∮,fil +∯,fil +∰,fil +∱,fil +∲,fil +∳,fil +∴,n +∵,uv +∶,il +∺,Hh +∻,t +∾,Nn +∿,Nn +≀,l +≅,Ee +≊,Ee +≋,Ee +≌,Ee +≎,o +≏,o +≑,t +≖,o +≗,o +≘,n +≙,Aan +≚,uv +≜,Aan +≝,def +≞,m +≡,Ee +≣,Ee +≤,Ece +≥,Ece +≦,Ece +≧,Ece +≨,Ece +≩,Ece +≪,c +≫,c +≬,o +≲,c +≳,c +≶,c +≷,c +≺,c +≻,c +≼,Ece +≽,Ece +≾,Ece +≿,Ece ⊂,c +⊃,c ⊆,c -⊕,o +⊇,Ece +⊊,c +⊋,c +⊌,Uu +⊍,Uu +⊎,Uu +⊏,c +⊐,c +⊑,c +⊒,c +⊓,Nn +⊔,Uu +⊕,ot ⊖,o -⊗,o -⊤,t⊤ -⊥,t -⋁,v⋁ -⋃,u⋃ -⋿,e⋿ -⍳,i⍳ +⊗,ox +⊘,o +⊙,o +⊚,o +⊛,o +⊜,o +⊝,o +⊞,ot +⊟,o +⊠,ox +⊡,o +⊢,ilt +⊣,ilt +⊤,Tilt⊤ +⊥,Tilt +⊦,il +⊧,cil +⊨,c +⊩,Hh +⊪,Hh +⊫,Hch +⊰,c +⊱,Ee +⊲,c +⊳,c +⊴,ce +⊵,ce +⊶,o +⊷,o +⊸,o +⊹,t +⊺,Tilt +⊻,v +⊼,n +⊽,v +⊾,o +⊿,o +⋀,Nn +⋁,UVuv⋁ +⋂,Nn +⋃,Uu⋃ +⋄,o +⋆,o +⋇,OXox +⋈,Xx +⋉,CXcx +⋊,Xx +⋋,n +⋌,Ccn +⋎,uv +⋏,n +⋐,c +⋑,c +⋒,n +⋓,u +⋔,hn +⋕,Hh +⋖,c +⋗,c +⋘,c +⋙,c +⋚,ESces +⋛,Ece +⋜,Ece +⋝,Ece +⋞,Ece +⋤,Ece +⋥,Ece +⋦,Ece +⋧,Ece +⋨,Ece +⋩,Ece +⋮,il +⋲,Ee +⋳,Ee +⋴,Ee +⋵,Ee +⋶,Ee +⋷,Ee +⋸,Ee +⋹,Ee +⋺,Ee +⋻,Ee +⋼,Ee +⋽,Ee +⋾,Ee +⋿,Ee⋿ +⌀,Oo +⌂,Aa +⌃,n +⌄,v +⌅,n +⌆,n +⌇,l +⌈,il +⌉,il +⌊,il +⌋,il +⌐,r +⌑,Oo +⌒,n +⌓,no +⌕,o +⌖,o +⌗,Hh +⌘,Hh +⌫,Xx +⌭,Hho +⌮,o +⌴,Uu +⌵,UVuv +⌶,Ii +⌹,t +⌺,o +⌻,o +⌼,Oo +⌽,LOlo +⌾,Oo +⌿,Llt +⍃,c +⍄,c +⍅,lt +⍆,lt +⍉,Oo +⍊,il +⍋,Aa +⍌,Vv +⍍,Aa +⍎,lo +⍏,t +⍐,ilt +⍑,Tt +⍒,Aal +⍓,Aan +⍔,Vv +⍕,Tt +⍖,t +⍗,il +⍘,i +⍙,Aao +⍚,o +⍛,o +⍜,Oo +⍝,ANan +⍟,Oo +⍡,Tt +⍢,o +⍤,o +⍥,Oo +⍦,w +⍧,Ccil +⍫,Aa +⍬,Oo +⍭,Llt +⍱,AVav +⍲,Aan +⍳,il⍳ ⍴,p⍴ +⍵,w +⍶,a +⍷,Ee +⍸,il +⍹,w ⍺,a⍺ -⏽,l⏽ +⍼,Ll +⍽,u +⍾,n +⍿,lo +⎀,av +⎁,a +⎂,a +⎃,a +⎄,DOdo +⎅,o +⎈,o +⎉,o +⎊,o +⎋,o +⎍,n +⎎,f +⎏,o +⎐,o +⎑,o +⎒,p +⎔,Oo +⎕,Oo +⎛,il +⎜,il +⎝,il +⎞,il +⎟,il +⎠,ijl +⎡,il +⎢,il +⎣,il +⎤,il +⎥,il +⎦,ijl +⎧,il +⎨,il +⎩,il +⎪,il +⎫,il +⎬,il +⎭,ijl +⎮,il +⎰,fil +⎱,il +⎲,c +⎳,c +⎴,n +⎵,u +⎶,h +⎷,v +⎸,il +⎹,il +⎾,il +⎿,Lil +⏀,IOio +⏁,OTot +⏂,Oo +⏃,AIail +⏄,ATat +⏅,Aa +⏆,Iilt +⏇,FTft +⏈,t +⏉,Tt +⏊,Lil +⏋,Lil +⏌,Jijl +⏍,Oo +⏏,Aa +⏐,il +⏑,u +⏒,ou +⏓,u +⏔,w +⏕,w +⏖,w +⏘,u +⏙,w +⏜,n +⏝,u +⏞,mn +⏟,uw +⏠,n +⏡,u +⏣,Oo +⏥,Oo +⏰,Oo +⏱,Oo +⏲,Oo +⏳,Hh +⏴,Cc +⏵,Dd +⏶,n +⏷,u +⏸,ILil +⏺,Oo +⏻,Oilo +⏼,Oilo +⏽,ILil⏽ +⏾,Cc +⏿,COco ⒜,a ⒝,b ⒞,c diff --git a/src/replacements_extra.csv b/src/replacements_extra.csv index 77ed4eb..95ab907 100644 --- a/src/replacements_extra.csv +++ b/src/replacements_extra.csv @@ -2790,4 +2790,375 @@ B,Bb ᶾ,e ᶿ,eo -\u{0627},li \ No newline at end of file +∀,VA +∁,C +∂,e +∃,E +∅,o +∆,A +∈,CE +∊,cE +∋,E +∍,E +∎,o +∏,N +∐,U +∑,E +∔,t +∕,li +∖,li +∗,o +∘,o +√,v +∛,v +∜,v +∝,co +∞,o +∟,L +∠,c +∡,C +∢,C +∣,li +∥,li +∧,n +∨,vu +∩,n +∪,uv +∫,fli +∬,fli +∭,fli +∮,fli +∯,fli +∰,fli +∱,fli +∲,fli +∳,fli +∴,n +∵,vu +∶,li +∺,H +∻,t +∾,N +∿,N +≀,l +≅,E +≊,E +≋,E +≌,E +≎,o +≏,o +≑,t +≖,o +≗,o +≘,n +≙,nA +≚,vu +≜,nA +≝,def +≞,m +≡,E +≣,E +≤,cE +≥,cE +≦,cE +≧,cE +≨,cE +≩,cE +≪,c +≫,c +≬,o +≲,c +≳,c +≶,c +≷,c +≺,c +≻,c +≼,cE +≽,cE +≾,cE +≿,cE +⊂,c +⊃,c +⊆,c +⊇,cE +⊊,c +⊋,c +⊌,U +⊍,U +⊎,U +⊏,c +⊐,c +⊑,c +⊒,c +⊓,N +⊔,U +⊕,ot +⊖,o +⊗,ox +⊘,o +⊙,o +⊚,o +⊛,o +⊜,o +⊝,o +⊞,ot +⊟,o +⊠,ox +⊡,o +⊢,tli +⊣,tli +⊤,Tli +⊥,Tli +⊦,li +⊧,lic +⊨,c +⊩,H +⊪,H +⊫,Hc +⊰,c +⊱,E +⊲,c +⊳,c +⊴,ce +⊵,ce +⊶,o +⊷,o +⊸,o +⊹,t +⊺,Til +⊻,v +⊼,n +⊽,v +⊾,o +⊿,o +⋀,N +⋁,VU +⋂,N +⋃,U +⋄,o +⋆,o +⋇,XO +⋈,X +⋉,CX +⋊,X +⋋,n +⋌,Cn +⋎,vu +⋏,n +⋐,c +⋑,c +⋒,n +⋓,u +⋔,hn +⋕,H +⋖,c +⋗,c +⋘,c +⋙,c +⋚,EcS +⋛,Ec +⋜,Ec +⋝,Ec +⋞,Ec +⋤,Ec +⋥,Ec +⋦,Ec +⋧,Ec +⋨,Ec +⋩,Ec +⋮,li +⋲,E +⋳,E +⋴,E +⋵,E +⋶,E +⋷,E +⋸,E +⋹,E +⋺,E +⋻,E +⋼,E +⋽,E +⋾,E +⋿,E + +⌀,O +⌂,A +⌃,n +⌄,v +⌅,n +⌆,n +⌇,l +⌈,li +⌉,li +⌊,li +⌋,li +⌐,r +⌑,O +⌒,n +⌓,no +⌕,o +⌖,o +⌗,H +⌘,H +⌫,X +⌭,Ho +⌮,o +⌴,U +⌵,VU +⌶,I +⌹,t +⌺,o +⌻,o +⌼,O +⌽,OL +⌾,O +⌿,Lt +⍃,c +⍄,c +⍅,lt +⍆,lt +⍉,O +⍊,li +⍋,A +⍌,V +⍍,A +⍎,ol +⍏,t +⍐,tli +⍑,T +⍒,Al +⍓,An +⍔,V +⍕,T +⍖,t +⍗,il +⍘,i +⍙,Ao +⍚,o +⍛,o +⍜,O +⍝,NA +⍟,O +⍡,T +⍢,o +⍤,o +⍥,O +⍦,w +⍧,Cil +⍫,A +⍬,O +⍭,Lt +⍱,AV +⍲,An +⍳,il +⍴,p +⍵,w +⍶,a +⍷,E +⍸,li +⍹,w +⍺,a +⍼,L +⍽,u +⍾,n +⍿,lo +⎀,av +⎁,a +⎂,a +⎃,a +⎄,DO +⎅,o +⎈,o +⎉,o +⎊,o +⎋,o +⎍,n +⎎,f +⎏,o +⎐,o +⎑,o +⎒,p +⎔,O +⎕,O +⎛,li +⎜,li +⎝,li +⎞,li +⎟,li +⎠,lij +⎡,li +⎢,li +⎣,li +⎤,li +⎥,li +⎦,lij +⎧,li +⎨,li +⎩,li +⎪,li +⎫,li +⎬,li +⎭,lij +⎮,li +⎰,lif +⎱,li +⎲,c +⎳,c +⎴,n +⎵,u +⎶,h +⎷,v +⎸,li +⎹,li +⎾,li +⎿,Li +⏀,OI +⏁,TO +⏂,O +⏃,AIl +⏄,AT +⏅,A +⏆,tIl +⏇,TF +⏈,t +⏉,T +⏊,Li +⏋,iL +⏌,ilJ +⏍,O +⏏,A +⏐,li +⏑,u +⏒,uo +⏓,u +⏔,w +⏕,w +⏖,w +⏘,u +⏙,w +⏜,n +⏝,u +⏞,mn +⏟,wu +⏠,n +⏡,u +⏣,O +⏥,O +⏰,O +⏱,O +⏲,O +⏳,H +⏴,C +⏵,D +⏶,n +⏷,u +⏸,LI +⏺,O +⏻,Oli +⏼,Oli +⏽,LI +⏾,C +⏿,CO + +\u{0627},li