Skip to content

Commit

Permalink
Merge pull request #552 from sul-dlss/revert-548-cocina
Browse files Browse the repository at this point in the history
Revert "Use cocina instead of public xml to drive content search indexing."
  • Loading branch information
cbeer authored Aug 7, 2024
2 parents 312ba34 + 18ec530 commit c8adfd3
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 88 deletions.
40 changes: 16 additions & 24 deletions app/models/purl_object.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ def initialize(druid)
@druid = druid
end

# Returns the nodes matched by the XPath `//contentMetadata/resource` in this
# object's parsed public XML, i.e. every `resource` element under a
# `contentMetadata` element.
def resources
public_xml.xpath('//contentMetadata/resource')
end

def ocr_files
return to_enum(:ocr_files) unless block_given?

resource_files.each do |file, file_set|
next unless file['use'] == 'transcription'
next unless file['hasMimeType'].in?(['application/xml', 'application/alto+xml', 'text/plain'])
next unless file['size'].to_i <= Settings.maximum_ocr_filesize_to_consider

yield PurlObject::File.new(druid, file, file_set.except('structural'))
resources.each do |r|
r.xpath('file[@role="transcription"][@mimetype="application/xml" or @mimetype="application/alto+xml" or @mimetype="text/plain"]').each do |file|
yield file unless file['size'].to_i > Settings.maximum_ocr_filesize_to_consider
end
end
end

Expand All @@ -35,14 +37,16 @@ def to_solr(options = { in_threads: 8 })
# Inject "bookkeeping" document into index first to record last published date
yield({ id: druid, druid: druid, published: published, resource_id: 'druid' })

results = Parallel.map(ocr_files, options, &:to_solr)
results = Parallel.map(ocr_files, options) do |file|
PurlObject::File.new(druid, file).to_solr
end

# preserving the stream-like API for now..
results.each { |r| yield r unless r.nil? }
end

def published
public_cocina['modified']
public_xml.root['published']
end

private
Expand All @@ -51,23 +55,11 @@ def fetch(url)
self.class.client.get(url).body.to_s
end

def public_cocina
@public_cocina ||= JSON.parse(public_cocina_body)
def public_xml
@public_xml ||= Nokogiri::XML.parse(public_xml_body)
end

def public_cocina_body
fetch(format(Settings.purl.public_cocina_url, druid: druid))
end

def resource_files
return to_enum(:resource_files) unless block_given?

public_cocina.dig('structural', 'contains')&.each do |file_set|
file_set.dig('structural', 'contains').each do |file|
next unless file.dig('administrative', 'shelve')

yield file, file_set
end
end
# Fetches the public XML body for this druid. The URL is built by substituting
# the druid into the `Settings.purl.public_xml_url` format string.
def public_xml_body
fetch(format(Settings.purl.public_xml_url, druid: druid))
end
end
13 changes: 6 additions & 7 deletions app/models/purl_object/file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,27 @@
class PurlObject
# File object within a PURL document
class File
attr_reader :druid, :file_metadata, :fileset_metadata
attr_reader :druid, :file_xml_fragment

# Returns the HTTP client stored in `Thread.current[:client]`, creating a
# persistent connection to `Settings.stacks.host` the first time it is
# requested in the current thread.
def self.client
Thread.current[:client] ||= HTTP.persistent(Settings.stacks.host)
end

def initialize(druid, file_metadata, fileset_metadata = {})
def initialize(druid, file_xml_fragment)
@druid = druid
@file_metadata = file_metadata
@fileset_metadata = fileset_metadata
@file_xml_fragment = file_xml_fragment
end

def resource_id
fileset_metadata['externalIdentifier']&.sub('https://cocina.sul.stanford.edu/fileSet/', 'cocina-fileSet-')
file_xml_fragment.xpath('..').first.attr('id')
end

def filename
file_metadata['filename']
file_xml_fragment.attr('id')
end

def mimetype
file_metadata['hasMimeType']
file_xml_fragment.attr('mimetype')
end

def file_url
Expand Down
3 changes: 1 addition & 2 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ maximum_ocr_filesize_to_consider: 10000000 # 10MB
purl:
canvas_url: 'https://purl.stanford.edu/%{druid}/iiif/canvas/%{resource}'
public_xml_url: 'https://purl.stanford.edu/%{druid}.xml'
public_cocina_url: 'https://purl.stanford.edu/%{druid}.json'

stacks:
host: 'https://stacks.stanford.edu'
Expand All @@ -23,4 +22,4 @@ solr:

kafka:
topic: testing_topic # Can be purl_fetcher_stage or purl_fetcher_prod
group_id: content-search
group_id: content-search
8 changes: 4 additions & 4 deletions spec/models/purl_object/file_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
require 'rails_helper'

RSpec.describe PurlObject::File do
subject(:purl_file) { described_class.new('somedruid', file_metadata, fileset_metadata) }
subject(:purl_file) { described_class.new('somedruid', file_xml_fragment) }

let(:file_metadata) { { filename: '92280263.xml', hasMimeType: 'application/xml', size: 46424 }.with_indifferent_access }
let(:fileset_metadata) { { externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/abc123' }.with_indifferent_access }
let(:file_xml_fragment) { Nokogiri::XML.parse(xml).root }
let(:xml) { '<file id="92280263.xml" mimetype="application/xml" size="46424"></file>' }

describe '#file_url' do
context 'with a file with a space' do
let(:file_metadata) { { filename: 'Read Me', hasMimeType: 'application/xml', size: 46424 }.with_indifferent_access }
let(:xml) { '<file id="Read Me" mimetype="application/xml" size="46424"></file>' }

it 'URI escapes the file name' do
expect(purl_file.file_url).to eq 'https://stacks.stanford.edu/file/somedruid/Read+Me'
Expand Down
79 changes: 29 additions & 50 deletions spec/models/purl_object_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,25 @@
RSpec.describe PurlObject do
subject(:object) { described_class.new('x') }

let(:public_cocina_json) do
{
modified: '2021-05-26T23:52:08Z',
structural: {
contains: [
{
externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/x-y',
structural: {
contains: [
{
filename: 'y.txt',
hasMimeType: 'text/plain',
use: 'transcription',
administrative: {
shelve: true
}
}
]
}
},
{
externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/x-oversize',
structural: {
contains: [
{
filename: 'oversize.txt',
size: 100.gigabytes,
hasMimeType: 'text/plain',
use: 'transcription',
administrative: {
shelve: true
}
}
]
}
}
]
}
}.to_json
let(:public_xml) do
<<-XML
<publicObject published="2021-05-26T23:52:08Z">
<contentMetadata>
<resource id="y">
<file id="y.txt" mimetype="text/plain" role="transcription" />
</resource>
<resource id="oversize">
<file id="oversize.txt" size="#{100.gigabytes}" mimetype="text/plain" />
</resource>
</contentMetadata>
</publicObject>
XML
end

let(:public_cocina_response) { instance_double(HTTP::Response, body: public_cocina_json) }
let(:public_xml_response) { instance_double(HTTP::Response, body: public_xml) }

before do
purl_url = 'https://purl.stanford.edu/x.json'
allow(described_class.client).to receive(:get).with(purl_url).and_return(public_cocina_response)
purl_url = 'https://purl.stanford.edu/x.xml'
allow(described_class.client).to receive(:get).with(purl_url).and_return(public_xml_response)
end

describe '#published' do
Expand All @@ -59,13 +32,19 @@
end
end

# Checks that #resources yields one node per <resource> element in the
# fixture XML, identified by their `id` attributes ('y' and 'oversize').
describe '#resources' do
it 'extracts resources from the contentMetadata' do
# NOTE: the block variable is named `file`, but each element here is a resource node.
expect(object.resources.map { |file| file['id'] }).to match_array %w[y oversize]
end
end

describe '#ocr_files' do
it 'has resources that are potentially OCR' do
expect(object.ocr_files.map(&:filename)).to include 'y.txt'
expect(object.ocr_files.map { |file| file['id'] }).to include 'y.txt'
end

it 'excludes resources that are unlikely to be OCR' do
expect(object.ocr_files.map(&:filename)).not_to include 'oversize.txt'
expect(object.ocr_files.map { |file| file['id'] }).not_to include 'oversize.txt'
end
end

Expand All @@ -82,11 +61,11 @@
end

it 'creates an indexable hash of OCR content' do
expect(object.to_solr.to_a).to include id: 'x/cocina-fileSet-x-y/y.txt',
druid: 'x',
resource_id: 'cocina-fileSet-x-y',
filename: 'y.txt',
ocrtext: ['text text text']
expect(object.to_solr).to include id: 'x/y/y.txt',
druid: 'x',
resource_id: 'y',
filename: 'y.txt',
ocrtext: ['text text text']
end

it 'indexes the published date' do
Expand Down
1 change: 0 additions & 1 deletion spec/models/search_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
it 'kicks off indexing if no results were found' do
client = instance_double(RSolr::Client, get: { 'response' => { 'numFound' => 0 }, 'highlighting' => {} })
allow(described_class).to receive(:client).and_return(client)
allow(HTTP).to receive(:get).and_return(instance_double(HTTP::Response, body: '{}'))
allow(IndexFullTextContentJob).to receive(:perform_now)
search.highlights
expect(IndexFullTextContentJob).to have_received(:perform_now).with('x', commit: true)
Expand Down

0 comments on commit c8adfd3

Please sign in to comment.