Skip to content

Commit

Permalink
Directional coom (#264)
Browse files Browse the repository at this point in the history
* Adding the directional co-occurrence matrix.
* Added tests
  • Loading branch information
atantos authored Jan 9, 2024
1 parent 0b5992c commit ce69b59
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 54 deletions.
108 changes: 65 additions & 43 deletions src/coom.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""
coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool)
coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool, mode::Symbol)
Basic low-level function that calculates the co-occurrence matrix of a document.
Returns a sparse co-occurrence matrix sized `n × n` where `n = length(vocab)`
with elements of type `T`. The document `doc` is represented by a vector of its
terms (in order). The arguments `window` and `normalize` indicate the size of the
sliding word window in which co-occurrences are counted and whether to normalize
of not the counts by the distance between word positions.
or not the counts by the distance between word positions. The `mode` argument can be either `:default` or `:directional` and indicates whether the co-occurrences are counted directionally or not. In both cases the result is a symmetric `n × n` matrix where `n = length(vocab)`. If `mode` is `:directional`, only the terms that follow each term inside the window are scanned, so `coom[i,j]` will be the (optionally distance-normalized) number of times `vocab[i]` co-occurs with `vocab[j]` in the document `doc`. If `mode` is `:default`, the window extends both backwards and forwards, so `coom[i,j]` will be twice that number (once for each direction, from i to j + from j to i).
# Example
```
Expand All @@ -30,28 +30,48 @@ julia> using TextAnalysis, DataStructures
[1, 2] = 2.0
[3, 2] = 0.3999
[2, 3] = 0.3999
julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)
3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 1.0
[1, 2] = 1.0
[3, 2] = 0.1999
[2, 3] = 0.1999
```
"""
function coo_matrix(::Type{T},
                    doc::Vector{<:AbstractString},
                    vocab::OrderedDict{<:AbstractString,Int},
                    window::Int,
                    normalize::Bool=true,
                    mode::Symbol=:default) where {T<:AbstractFloat}
    # Low-level kernel: builds the sparse `n × n` co-occurrence matrix
    # (`n = length(vocab)`) of the token sequence `doc`, with elements of type `T`.
    #
    # Arguments
    #   doc       - document terms, in order of appearance
    #   vocab     - maps a term to its row/column index in the result
    #   window    - maximum distance (in token positions) between co-occurring terms
    #   normalize - when `true`, each co-occurrence contributes `1 / distance`
    #               instead of `1`
    #   mode      - `:default` scans the window on both sides of every token;
    #               `:directional` scans only the tokens that follow it
    #
    # Throws an `ArgumentError` for any other `mode`, so that a misspelled symbol
    # does not silently fall back to the default behaviour.
    mode in (:default, :directional) || throw(ArgumentError(
        "mode must be :default or :directional, got $(repr(mode))"))

    # Initializations
    n = length(vocab)
    m = length(doc)
    coom = spzeros(T, n, n)

    # Count co-occurrences
    for (i, token) in enumerate(doc)
        # Tokens outside the vocabulary contribute nothing; skip them before
        # doing any window arithmetic.
        row = get(vocab, token, nothing)
        isnothing(row) && continue

        # In directional mode only look forward; otherwise look both ways.
        first_j = mode == :directional ? i : max(1, i - window)
        last_j = min(m, i + window)

        # `first_j:last_j` is clamped to `1:m`, so indexing `doc[j]` is in bounds.
        @inbounds for j in first_j:last_j
            i == j && continue

            col = get(vocab, doc[j], nothing)
            isnothing(col) && continue

            nm = T(ifelse(normalize, abs(i - j), 1))
            coom[row, col] += one(T) / nm
            # Mirror the updated entry so the matrix stays symmetric in both modes.
            coom[col, row] = coom[row, col]
        end
    end
    return coom
end

# Convenience method for a plain `Dict` vocabulary: converts it to an
# `OrderedDict` and forwards every argument, including `mode`, to the
# `OrderedDict` method of `coo_matrix`.
function coo_matrix(::Type{T},
                    doc::Vector{<:AbstractString},
                    vocab::Dict{<:AbstractString,Int},
                    window::Int,
                    normalize::Bool=true,
                    mode::Symbol=:default) where {T<:AbstractFloat}
    return coo_matrix(T, doc, OrderedDict(vocab), window, normalize, mode)
end

"""
Basic Co-occurrence Matrix (COOM) type.
Expand All @@ -75,9 +95,9 @@ the document or corpus
columns of the co-occurrence matrix
"""
struct CooMatrix{T}
    # Sparse co-occurrence matrix with elements of type `T`
    # (the constructors in this file fill it from `coo_matrix`).
    coom::SparseMatrixCSC{T,Int}
    # Terms the matrix was built for.
    terms::Vector{String}
    # Maps each term to its index in `coom`
    # (built by the constructors via `columnindices(terms)`).
    column_indices::OrderedDict{String,Int}
end


Expand All @@ -91,66 +111,68 @@ can be a `Vector{String}`, an `AbstractDict` where the keys are the lexicon,
or can be omitted, in which case the `lexicon` field of the corpus is used.
"""
function CooMatrix{T}(crps::Corpus,
                      terms::Vector{String};
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    # Corpus constructor: the corpus matrix is the element-wise sum of the
    # co-occurrence matrices of its documents, all computed against the same
    # term -> index mapping and with the same `window`, `normalize` and `mode`.
    column_indices = OrderedDict(columnindices(terms))
    n = length(terms)
    coom = spzeros(T, n, n)
    for doc in crps
        coom .+= coo_matrix(T, tokens(doc), column_indices, window, normalize, mode)
    end
    return CooMatrix{T}(coom, terms, column_indices)
end

# Untyped corpus constructor with explicit terms: element type defaults to
# `Float64`; all keywords (including `mode`) are forwarded unchanged.
function CooMatrix(crps::Corpus,
                   terms::Vector{String};
                   window::Int=5,
                   normalize::Bool=true,
                   mode::Symbol=:default)
    return CooMatrix{Float64}(crps, terms, window=window, normalize=normalize, mode=mode)
end

# Typed corpus constructor taking a lexicon dictionary: the dictionary keys are
# collected and used as the terms; the values are not used here.
# NOTE(review): the call below resolves to the `Vector{String}` method only when
# the keys are `String`s — confirm against callers.
function CooMatrix{T}(crps::Corpus,
                      lex::AbstractDict;
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    return CooMatrix{T}(crps, collect(keys(lex)),
                        window=window, normalize=normalize, mode=mode)
end

# Untyped corpus constructor taking a lexicon dictionary: element type defaults
# to `Float64`; all keywords (including `mode`) are forwarded unchanged.
function CooMatrix(crps::Corpus,
                   lex::AbstractDict;
                   window::Int=5,
                   normalize::Bool=true,
                   mode::Symbol=:default)
    return CooMatrix{Float64}(crps, lex, window=window, normalize=normalize, mode=mode)
end

# Typed corpus constructor without explicit terms: uses the corpus lexicon.
# Side effect: when the lexicon is empty it is first populated in place with
# `update_lexicon!(crps)`.
function CooMatrix{T}(crps::Corpus;
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    isempty(lexicon(crps)) && update_lexicon!(crps)
    return CooMatrix{T}(crps, lexicon(crps),
                        window=window, normalize=normalize, mode=mode)
end

# Untyped corpus constructor without explicit terms: same as the typed variant
# with element type `Float64`. Side effect: an empty corpus lexicon is populated
# in place with `update_lexicon!(crps)` before it is used.
function CooMatrix(crps::Corpus;
                   window::Int=5,
                   normalize::Bool=true,
                   mode::Symbol=:default)
    isempty(lexicon(crps)) && update_lexicon!(crps)
    return CooMatrix{Float64}(crps, lexicon(crps),
                              window=window, normalize=normalize, mode=mode)
end

# Document methods
# Typed single-document constructor with explicit terms: maps the terms to
# column indices and computes the co-occurrence matrix of the document tokens
# with the given `window`, `normalize` and `mode`.
function CooMatrix{T}(doc::AbstractDocument,
                      terms::Vector{String};
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    column_indices = OrderedDict(columnindices(terms))
    coom = coo_matrix(T, tokens(doc), column_indices, window, normalize, mode)
    return CooMatrix{T}(coom, terms, column_indices)
end

# `NGramDocument`s are rejected: this method always raises an `ErrorException`.
# The keywords are accepted only so the signature matches the other document
# constructors.
function CooMatrix{T}(doc::NGramDocument,
                      terms::Vector{String};
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    error("The Co occurrence matrix of an NGramDocument can't be created.")
end

# Untyped document constructor with explicit terms: element type defaults to
# `Float64`; all keywords (including `mode`) are forwarded unchanged.
function CooMatrix(doc,
                   terms::Vector{String};
                   window::Int=5,
                   normalize::Bool=true,
                   mode::Symbol=:default)
    return CooMatrix{Float64}(doc, terms, window=window, normalize=normalize, mode=mode)
end

# Typed document constructor without explicit terms: the terms are the unique
# tokens of the document, converted to `String`, in order of first appearance.
function CooMatrix{T}(doc;
                      window::Int=5,
                      normalize::Bool=true,
                      mode::Symbol=:default) where {T<:AbstractFloat}
    terms = unique(String.(tokens(doc)))
    return CooMatrix{T}(doc, terms, window=window, normalize=normalize, mode=mode)
end

# Untyped document constructor without explicit terms: element type defaults to
# `Float64`; all keywords (including `mode`) are forwarded unchanged.
function CooMatrix(doc;
                   window::Int=5,
                   normalize::Bool=true,
                   mode::Symbol=:default)
    return CooMatrix{Float64}(doc, window=window, normalize=normalize, mode=mode)
end

"""
coom(c::CooMatrix)
Expand All @@ -167,5 +189,5 @@ with the `entity`. The `CooMatrix{T}` will first have to
be created in order for the actual matrix to be accessed.
"""
function coom(entity, eltype::Type{T}=Float64;
              window::Int=5,
              normalize::Bool=true,
              mode::Symbol=:default) where {T<:AbstractFloat}
    # The element type defaults to `Float64`, the same default the untyped
    # `CooMatrix` constructors use. The previous default, `Float`, is not a
    # Base type, so calling `coom(entity)` without an explicit element type
    # would fail unless the package defines `Float` elsewhere.
    # NOTE(review): confirm no package-level `Float` alias was intended.
    #
    # Builds a temporary `CooMatrix{T}` for `entity` and returns the result of
    # `coom` applied to it.
    return coom(CooMatrix{T}(entity, window=window, normalize=normalize, mode=mode))
end
62 changes: 51 additions & 11 deletions test/coom.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
@testset "COOM (Co-occurence Matrix)" begin
doc_raw = StringDocument("This is a document. It has two sentences.")
prepare!(doc_raw, strip_punctuation|strip_whitespace|strip_case)
prepare!(doc_raw, strip_punctuation | strip_whitespace | strip_case)
doc = text(doc_raw)
sd = StringDocument(doc)
td = TokenDocument(doc)
nd = NGramDocument(doc)
crps = Corpus([sd, td])
T = Float16
# Results for window = 5, all terms in document used
expected_result = [ # for window == 5
# expected_result_C is the expected matrix for the normalized and default mode case.
expected_result_C = [ # for window == 5
0.0 2.0 1.0 2/3 0.5 0.4 0.0 0.0
2.0 0.0 2.0 1.0 2/3 0.5 0.4 0.0
1.0 2.0 0.0 2.0 1.0 2/3 0.5 0.4
Expand All @@ -17,14 +18,28 @@
0.4 0.5 2/3 1.0 2.0 0.0 2.0 1.0
0.0 0.4 0.5 2/3 1.0 2.0 0.0 2.0
0.0 0.0 0.4 0.5 2/3 1.0 2.0 0.0]

# expected_result_D is the expected matrix for the normalized and directional mode case.
expected_result_D = [ # for window == 5
0.0 1.0 0.5 1/3 0.25 0.2 0.0 0.0
1.0 0.0 1.0 0.5 1/3 0.25 0.2 0.0
0.5 1.0 0.0 1.0 0.5 1/3 0.25 0.2
1/3 0.5 1.0 0.0 1.0 0.5 1/3 0.25
0.25 1/3 0.5 1.0 0.0 1.0 0.5 1/3
0.2 0.25 1/3 0.5 1.0 0.0 1.0 0.5
0.0 0.2 0.25 1/3 0.5 1.0 0.0 1.0
0.0 0.0 0.2 0.25 1/3 0.5 1.0 0.0]
# Verify untyped constructor
terms = tokens(td)
for d in [sd, td, crps]
C = TextAnalysis.CooMatrix(d, terms)
D = TextAnalysis.CooMatrix(d, terms, mode=:directional)
if !(d isa Corpus)
@test TextAnalysis.coom(C) == expected_result
@test TextAnalysis.coom(C) == expected_result_C
@test TextAnalysis.coom(D) == expected_result_D
else
@test TextAnalysis.coom(C) == length(crps) * expected_result
@test TextAnalysis.coom(C) == length(crps) * expected_result_C
@test TextAnalysis.coom(D) == length(crps) * expected_result_D
end
end
@test_throws ErrorException TextAnalysis.CooMatrix(nd)
Expand All @@ -33,28 +48,53 @@
terms = tokens(td)
for d in [sd, td, crps]
C = TextAnalysis.CooMatrix{T}(d, terms)
D = TextAnalysis.CooMatrix{T}(d, terms, mode=:directional)
@test C isa TextAnalysis.CooMatrix{T}
if !(d isa Corpus)
@test TextAnalysis.coom(C) == T.(expected_result)
@test TextAnalysis.coom(C) == T.(expected_result_C)
@test TextAnalysis.coom(D) == T.(expected_result_D)
else
@test TextAnalysis.coom(C) == length(crps) * T.(expected_result)
@test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
@test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
end
end
@test_throws ErrorException TextAnalysis.CooMatrix{T}(nd)

# Results for window = 1, custom terms
terms = ["this", "document", "it"]
expected_result = [0.0 0.0 0.0; # document
0.0 0.0 2.0; # it
0.0 2.0 0.0] # this
expected_result_C = [0.0 0.0 0.0; # document
0.0 0.0 2.0; # it
0.0 2.0 0.0] # this

expected_result_D = [0.0 0.0 0.0; # document
0.0 0.0 1.0; # it
0.0 1.0 0.0] # this

# Verify untyped constructor
for d in [sd, td, crps]
C = TextAnalysis.CooMatrix(d, terms, window=1)
D = TextAnalysis.CooMatrix(d, terms, window=1, mode=:directional)
if !(d isa Corpus)
@test TextAnalysis.coom(C) == T.(expected_result_C)
@test TextAnalysis.coom(D) == T.(expected_result_D)
else
@test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
@test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
end
end
@test_throws ErrorException TextAnalysis.CooMatrix(nd)

# Verify typed constructor
for d in [sd, td, crps]
C = TextAnalysis.CooMatrix{T}(d, terms, window=1)
D = TextAnalysis.CooMatrix{T}(d, terms, window=1, mode=:directional)
@test C isa TextAnalysis.CooMatrix{T}
if !(d isa Corpus)
@test TextAnalysis.coom(C) == T.(expected_result)
@test TextAnalysis.coom(C) == T.(expected_result_C)
@test TextAnalysis.coom(D) == T.(expected_result_D)
else
@test TextAnalysis.coom(C) == length(crps) * T.(expected_result)
@test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
@test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
end
end
@test_throws ErrorException TextAnalysis.CooMatrix{T}(nd)
Expand Down

0 comments on commit ce69b59

Please sign in to comment.