Merge pull request #552 from sul-dlss/revert-548-cocina

Revert "Use cocina instead of public xml to drive content search indexing."
sul-dlss · Aug 7, 2024 · commit c8adfd3
2 parents: 312ba34 + 18ec530
Showing 6 changed files with 56 additions and 88 deletions.
diff --git a/app/models/purl_object.rb b/app/models/purl_object.rb
@@ -17,15 +17,17 @@ def initialize(druid)
     @druid = druid
   end
 
+  def resources
+    public_xml.xpath('//contentMetadata/resource')
+  end
+
   def ocr_files
     return to_enum(:ocr_files) unless block_given?
 
-    resource_files.each do |file, file_set|
-      next unless file['use'] == 'transcription'
-      next unless file['hasMimeType'].in?(['application/xml', 'application/alto+xml', 'text/plain'])
-      next unless file['size'].to_i <= Settings.maximum_ocr_filesize_to_consider
-
-      yield PurlObject::File.new(druid, file, file_set.except('structural'))
+    resources.each do |r|
+      r.xpath('file[@role="transcription"][@mimetype="application/xml" or @mimetype="application/alto+xml" or @mimetype="text/plain"]').each do |file|
+        yield file unless file['size'].to_i > Settings.maximum_ocr_filesize_to_consider
+      end
     end
   end
 
@@ -35,14 +37,16 @@ def to_solr(options = { in_threads: 8 })
     # Inject "bookkeeping" document into index first to record last published date
     yield({ id: druid, druid: druid, published: published, resource_id: 'druid' })
 
-    results = Parallel.map(ocr_files, options, &:to_solr)
+    results = Parallel.map(ocr_files, options) do |file|
+      PurlObject::File.new(druid, file).to_solr
+    end
 
     # preserving the stream-like API for now..
     results.each { |r| yield r unless r.nil? }
   end
 
   def published
-    public_cocina['modified']
+    public_xml.root['published']
   end
 
   private
@@ -51,23 +55,11 @@ def fetch(url)
     self.class.client.get(url).body.to_s
   end
 
-  def public_cocina
-    @public_cocina ||= JSON.parse(public_cocina_body)
+  def public_xml
+    @public_xml ||= Nokogiri::XML.parse(public_xml_body)
   end
 
-  def public_cocina_body
-    fetch(format(Settings.purl.public_cocina_url, druid: druid))
-  end
-
-  def resource_files
-    return to_enum(:resource_files) unless block_given?
-
-    public_cocina.dig('structural', 'contains')&.each do |file_set|
-      file_set.dig('structural', 'contains').each do |file|
-        next unless file.dig('administrative', 'shelve')
-
-        yield file, file_set
-      end
-    end
+  def public_xml_body
+    fetch(format(Settings.purl.public_xml_url, druid: druid))
   end
 end
diff --git a/app/models/purl_object/file.rb b/app/models/purl_object/file.rb
@@ -3,28 +3,27 @@
 class PurlObject
   # File object within a PURL document
   class File
-    attr_reader :druid, :file_metadata, :fileset_metadata
+    attr_reader :druid, :file_xml_fragment
 
     def self.client
       Thread.current[:client] ||= HTTP.persistent(Settings.stacks.host)
     end
 
-    def initialize(druid, file_metadata, fileset_metadata = {})
+    def initialize(druid, file_xml_fragment)
       @druid = druid
-      @file_metadata = file_metadata
-      @fileset_metadata = fileset_metadata
+      @file_xml_fragment = file_xml_fragment
     end
 
     def resource_id
-      fileset_metadata['externalIdentifier']&.sub('https://cocina.sul.stanford.edu/fileSet/', 'cocina-fileSet-')
+      file_xml_fragment.xpath('..').first.attr('id')
     end
 
     def filename
-      file_metadata['filename']
+      file_xml_fragment.attr('id')
     end
 
     def mimetype
-      file_metadata['hasMimeType']
+      file_xml_fragment.attr('mimetype')
     end
 
     def file_url

diff --git a/config/settings.yml b/config/settings.yml
@@ -6,7 +6,6 @@ maximum_ocr_filesize_to_consider: 10000000 # 10MB
 purl:
   canvas_url: 'https://purl.stanford.edu/%{druid}/iiif/canvas/%{resource}'
   public_xml_url: 'https://purl.stanford.edu/%{druid}.xml'
-  public_cocina_url: 'https://purl.stanford.edu/%{druid}.json'
 
 stacks:
   host: 'https://stacks.stanford.edu'
@@ -23,4 +22,4 @@ solr:
 
 kafka:
   topic: testing_topic # Can be purl_fetcher_stage or purl_fetcher_prod
-  group_id: content-search
+  group_id: content-search
diff --git a/spec/models/purl_object/file_spec.rb b/spec/models/purl_object/file_spec.rb
@@ -3,14 +3,14 @@
 require 'rails_helper'
 
 RSpec.describe PurlObject::File do
-  subject(:purl_file) { described_class.new('somedruid', file_metadata, fileset_metadata) }
+  subject(:purl_file) { described_class.new('somedruid', file_xml_fragment) }
 
-  let(:file_metadata) { { filename: '92280263.xml', hasMimeType: 'application/xml', size: 46424 }.with_indifferent_access }
-  let(:fileset_metadata) { { externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/abc123' }.with_indifferent_access }
+  let(:file_xml_fragment) { Nokogiri::XML.parse(xml).root }
+  let(:xml) { '<file id="92280263.xml" mimetype="application/xml" size="46424"></file>' }
 
   describe '#file_url' do
     context 'with a file with a space' do
-      let(:file_metadata) { { filename: 'Read Me', hasMimeType: 'application/xml', size: 46424 }.with_indifferent_access }
+      let(:xml) { '<file id="Read Me" mimetype="application/xml" size="46424"></file>' }
 
       it 'URI escapes the file name' do
         expect(purl_file.file_url).to eq 'https://stacks.stanford.edu/file/somedruid/Read+Me'

diff --git a/spec/models/purl_object_spec.rb b/spec/models/purl_object_spec.rb
@@ -5,52 +5,25 @@
 RSpec.describe PurlObject do
   subject(:object) { described_class.new('x') }
 
-  let(:public_cocina_json) do
-    {
-      modified: '2021-05-26T23:52:08Z',
-      structural: {
-        contains: [
-          {
-            externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/x-y',
-            structural: {
-              contains: [
-                {
-                  filename: 'y.txt',
-                  hasMimeType: 'text/plain',
-                  use: 'transcription',
-                  administrative: {
-                    shelve: true
-                  }
-                }
-              ]
-            }
-          },
-          {
-            externalIdentifier: 'https://cocina.sul.stanford.edu/fileSet/x-oversize',
-            structural: {
-              contains: [
-                {
-                  filename: 'oversize.txt',
-                  size: 100.gigabytes,
-                  hasMimeType: 'text/plain',
-                  use: 'transcription',
-                  administrative: {
-                    shelve: true
-                  }
-                }
-              ]
-            }
-          }
-        ]
-      }
-    }.to_json
+  let(:public_xml) do
+    <<-XML
+    <publicObject published="2021-05-26T23:52:08Z">
+      <contentMetadata>
+        <resource id="y">
+          <file id="y.txt" mimetype="text/plain" role="transcription" />
+        </resource>
+        <resource id="oversize">
+          <file id="oversize.txt" size="#{100.gigabytes}" mimetype="text/plain" />
+        </resource>
+      </contentMetadata>
+    </publicObject>
+    XML
   end
-
-  let(:public_cocina_response) { instance_double(HTTP::Response, body: public_cocina_json) }
+  let(:public_xml_response) { instance_double(HTTP::Response, body: public_xml) }
 
   before do
-    purl_url = 'https://purl.stanford.edu/x.json'
-    allow(described_class.client).to receive(:get).with(purl_url).and_return(public_cocina_response)
+    purl_url = 'https://purl.stanford.edu/x.xml'
+    allow(described_class.client).to receive(:get).with(purl_url).and_return(public_xml_response)
   end
 
   describe '#published' do
@@ -59,13 +32,19 @@
     end
   end
 
+  describe '#resources' do
+    it 'extracts resources from the contentMetadata' do
+      expect(object.resources.map { |file| file['id'] }).to match_array %w[y oversize]
+    end
+  end
+
   describe '#ocr_files' do
     it 'has resources that are potentially OCR' do
-      expect(object.ocr_files.map(&:filename)).to include 'y.txt'
+      expect(object.ocr_files.map { |file| file['id'] }).to include 'y.txt'
     end
 
     it 'excludes resources that are unlikely to be OCR' do
-      expect(object.ocr_files.map(&:filename)).not_to include 'oversize.txt'
+      expect(object.ocr_files.map { |file| file['id'] }).not_to include 'oversize.txt'
     end
   end
 
@@ -82,11 +61,11 @@
     end
 
     it 'creates an indexable hash of OCR content' do
-      expect(object.to_solr.to_a).to include id: 'x/cocina-fileSet-x-y/y.txt',
-                                             druid: 'x',
-                                             resource_id: 'cocina-fileSet-x-y',
-                                             filename: 'y.txt',
-                                             ocrtext: ['text text text']
+      expect(object.to_solr).to include id: 'x/y/y.txt',
+                                        druid: 'x',
+                                        resource_id: 'y',
+                                        filename: 'y.txt',
+                                        ocrtext: ['text text text']
     end
 
     it 'indexes the published date' do

diff --git a/spec/models/search_spec.rb b/spec/models/search_spec.rb
@@ -30,7 +30,6 @@
     it 'kicks off indexing if no results were found' do
       client = instance_double(RSolr::Client, get: { 'response' => { 'numFound' => 0 }, 'highlighting' => {} })
       allow(described_class).to receive(:client).and_return(client)
-      allow(HTTP).to receive(:get).and_return(instance_double(HTTP::Response, body: '{}'))
       allow(IndexFullTextContentJob).to receive(:perform_now)
       search.highlights
       expect(IndexFullTextContentJob).to have_received(:perform_now).with('x', commit: true)