diff --git a/README.md b/README.md index ae2b515..dcb5132 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Adds uniref parser [#107](https://github.com/Koeng101/dnadesign/pull/107) +- Fixes iso-8859-1 error in reading uniref data dumps [#106](https://github.com/Koeng101/dnadesign/pull/106) - Updates uniprot parser to read IDs [#104](https://github.com/Koeng101/dnadesign/pull/104) - Fixes RecursiveFragment to not add flanks to the initial input [#102](https://github.com/Koeng101/dnadesign/pull/102) - Fixes add flank bug, releases new version of python lib [#101](https://github.com/Koeng101/dnadesign/pull/101) diff --git a/lib/bio/bio.go b/lib/bio/bio.go index 4fc5e72..bbeac43 100644 --- a/lib/bio/bio.go +++ b/lib/bio/bio.go @@ -24,6 +24,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio/sam" "github.com/koeng101/dnadesign/lib/bio/slow5" "github.com/koeng101/dnadesign/lib/bio/uniprot" + "github.com/koeng101/dnadesign/lib/bio/uniref" ) // Format is a enum of different parser formats. @@ -63,12 +64,12 @@ Lower level interfaces // DataTypes defines the possible data types returned by every parser. type DataTypes interface { - genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry + genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry | uniref.Entry } // HeaderTypes defines the possible header types returned by every parser. type HeaderTypes interface { - genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header + genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header | uniref.Header } // ParserInterface is a generic interface that all parsers must support. 
It is @@ -171,6 +172,13 @@ func NewUniprotParser(r io.Reader) *Parser[uniprot.Entry, uniprot.Header] { return &Parser[uniprot.Entry, uniprot.Header]{ParserInterface: uniprot.NewParser(r)} } +// NewUnirefParser initiates a new Uniref parser from an io.Reader. No +// maxLineLength is necessary. +func NewUnirefParser(r io.Reader) (*Parser[uniref.Entry, uniref.Header], error) { + parser, err := uniref.NewParser(r) + return &Parser[uniref.Entry, uniref.Header]{ParserInterface: parser}, err +} + /****************************************************************************** Parser higher-level functions diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index 6fbaaaf..a948d7b 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -389,6 +389,45 @@ func ExampleNewUniprotParser() { // Output: P0C9F0 } +func ExampleNewUnirefParser() { + // The following is the first gene of UniRef50 with the sequence truncated. + // We're going to gzip it and put the gzipped text as an io.Reader to mock + // a file. You can edit the text here to see how the parser works. + // + // Note: Unlike the uniprot parser, the uniref parser expects that the file is + // properly terminated with . + uniprotEntryText := strings.NewReader(` + + +Cluster: uncharacterized protein LOC134193701 + + + + + + + + + + + + + +MGR + + +`) + // Now we load the parser, and get the first entry out. + parser, _ := bio.NewUnirefParser(uniprotEntryText) + entry, _ := parser.Next() + + fmt.Println(entry.ID) + // Output: UniRef50_UPI002E2621C6 +} + func ExampleNewSamParser() { // The following can be replaced with a any io.Reader. For example, // `file, err := os.Open(path)` for file would also work. 
diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go index 63a367a..029a63f 100644 --- a/lib/bio/uniprot/uniprot.go +++ b/lib/bio/uniprot/uniprot.go @@ -28,7 +28,6 @@ import ( "io" "net/http" "net/url" - "strings" ) // Decoder decodes XML elements2 @@ -68,14 +67,6 @@ type Parser struct { func NewParser(r io.Reader) *Parser { decoder := xml.NewDecoder(r) - // Oddly enough, the uniref datasets use iso-8859-1, not UTF-8. So we need - // to incorporate this decoder charset reader. - decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { - if strings.ToLower(charset) == "iso-8859-1" { - return input, nil // ISO-8859-1 bytes can be read directly as UTF-8 - } - return nil, fmt.Errorf("unsupported charset: %s", charset) - } return &Parser{decoder: decoder} } diff --git a/lib/bio/uniref/data/uniref300.xml b/lib/bio/uniref/data/uniref300.xml new file mode 100644 index 0000000..4421dce --- /dev/null +++ b/lib/bio/uniref/data/uniref300.xml @@ -0,0 +1,42 @@ + + + +Cluster: uncharacterized protein LOC134193701 + + + + + + + + + + + + + +MGR + + + +Cluster: LOW QUALITY PROTEIN: titin + + + + + + + + + + + + + +MSEQ + + + diff --git a/lib/bio/uniref/example_test.go b/lib/bio/uniref/example_test.go new file mode 100644 index 0000000..316c18f --- /dev/null +++ b/lib/bio/uniref/example_test.go @@ -0,0 +1,30 @@ +package uniref_test + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/koeng101/dnadesign/lib/bio/uniref" +) + +func Example() { + // Open the gzipped UniRef file + file, _ := os.Open(filepath.Join("data", "uniref300.xml")) + defer file.Close() + + // Create new parser + parser, _ := uniref.NewParser(file) + + // Read and print the first entry + entry, _ := parser.Next() + + fmt.Printf("Entry ID: %s\n", entry.ID) + fmt.Printf("Name: %s\n", entry.Name) + fmt.Printf("Sequence Length: %d\n", entry.RepMember.Sequence.Length) + + // Output: + // Entry ID: UniRef50_UPI002E2621C6 + // Name: Cluster: uncharacterized protein 
// Package uniref provides a parser for UniRef XML files.
//
// UniRef clusters uniprot proteins by similarity. This is useful for doing
// bioinformatics on protein space, as many proteins are sequenced a ton of
// times in different organisms, and you don't want those proteins to dominate
// your training data.
//
// UniRef data dumps are available as FASTA or XML formatted data. The XML has
// more rich data, so we use that. The parser was created using AI.
//
// UniProt Reference Clusters (UniRef) provide clustered sets of sequences from
// the UniProt Knowledgebase (including isoforms) and selected UniParc records
// in order to obtain complete coverage of the sequence space at several
// resolutions while hiding redundant sequences (but not their descriptions)
// from view. (taken from uniref reference https://www.uniprot.org/help/uniref)
//
// Download uniref data dumps here: https://www.uniprot.org/downloads

// Header is an empty struct since UniRef files don't have headers.
type Header struct{}

// Entry represents a single UniRef cluster, i.e. one <entry> element.
type Entry struct {
	XMLName    xml.Name             `xml:"entry"`
	ID         string               `xml:"id,attr"`
	Updated    string               `xml:"updated,attr"`
	Name       string               `xml:"name"`
	Properties []Property           `xml:"property"`
	RepMember  RepresentativeMember `xml:"representativeMember"`
	Members    []Member             `xml:"member"`
}

// Property represents a <property type="..." value="..."/> element.
type Property struct {
	Type  string `xml:"type,attr"`
	Value string `xml:"value,attr"`
}

// DBReference represents a <dbReference> element and its nested properties.
type DBReference struct {
	Type       string     `xml:"type,attr"`
	ID         string     `xml:"id,attr"`
	Properties []Property `xml:"property"`
}

// Sequence represents a <sequence> element. Value holds the element's
// character data exactly as it appears in the file.
type Sequence struct {
	Length   int    `xml:"length,attr"`
	Checksum string `xml:"checksum,attr"`
	Value    string `xml:",chardata"`
}

// Member represents a <member> element. Sequence is nil when the member has
// no <sequence> child.
type Member struct {
	DBRef    DBReference `xml:"dbReference"`
	Sequence *Sequence   `xml:"sequence"`
}

// RepresentativeMember represents the <representativeMember> element. It has
// the same layout as Member.
type RepresentativeMember Member

// UniRef represents a whole UniRef50 document held in memory. The Parser does
// not use it (it streams entries instead); the type is kept so existing
// callers that decode small documents directly keep compiling.
type UniRef struct {
	XMLName     xml.Name `xml:"UniRef50"`
	ReleaseDate string   `xml:"releaseDate,attr"`
	Version     string   `xml:"version,attr"`
	Entries     []Entry  `xml:"entry"`
}

// Parser reads Entry values one at a time from a UniRef XML stream.
type Parser struct {
	decoder *xml.Decoder
}

// NewParser creates a Parser reading from r. The returned error is always nil;
// it is part of the signature so that callers do not have to change if
// construction ever needs to fail.
func NewParser(r io.Reader) (*Parser, error) {
	decoder := xml.NewDecoder(r)
	// UniRef dumps declare encoding="ISO-8859-1", which encoding/xml refuses
	// to read unless a CharsetReader is installed.
	decoder.CharsetReader = charsetReader
	return &Parser{decoder: decoder}, nil
}

// charsetReader is the xml.Decoder.CharsetReader hook. It supports
// ISO-8859-1 (case-insensitively) and rejects every other charset.
func charsetReader(charset string, input io.Reader) (io.Reader, error) {
	if strings.EqualFold(charset, "iso-8859-1") {
		return &latin1Reader{src: input}, nil
	}
	return nil, fmt.Errorf("unsupported charset: %s", charset)
}

// latin1Reader transcodes an ISO-8859-1 byte stream to UTF-8. Bytes below
// 0x80 are passed through; every other byte b is the code point U+00b and is
// emitted as its two-byte UTF-8 encoding.
type latin1Reader struct {
	src     io.Reader
	raw     [1024]byte // scratch space for bytes read from src
	utf     [2048]byte // backing array for pending; worst case is 2x raw
	pending []byte     // transcoded bytes not yet handed to the caller
	err     error      // sticky error from src, reported once pending drains
}

// Read implements io.Reader.
func (l *latin1Reader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	for len(l.pending) == 0 {
		if l.err != nil {
			return 0, l.err
		}
		n, err := l.src.Read(l.raw[:])
		l.err = err
		out := l.utf[:0]
		for _, b := range l.raw[:n] {
			if b < 0x80 {
				out = append(out, b)
				continue
			}
			out = append(out, 0xC0|b>>6, 0x80|b&0x3F)
		}
		l.pending = out
	}
	n := copy(p, l.pending)
	l.pending = l.pending[n:]
	return n, nil
}

// Header returns an empty header since UniRef files don't have headers.
func (p *Parser) Header() (Header, error) {
	return Header{}, nil
}

// Next returns the next Entry from the UniRef file, or io.EOF once the input
// is exhausted.
//
// Entries are decoded one at a time straight from the token stream, so memory
// use is bounded by the largest single entry rather than by the size of the
// file. The name of the root element is not checked, which lets the same
// parser read UniRef50, UniRef90 and UniRef100 dumps.
func (p *Parser) Next() (Entry, error) {
	for {
		tok, err := p.decoder.Token()
		if err != nil {
			// Includes io.EOF, which is deliberately returned unwrapped so
			// callers can compare against it directly.
			return Entry{}, err
		}
		start, ok := tok.(xml.StartElement)
		if !ok || start.Name.Local != "entry" {
			continue
		}
		var entry Entry
		if err := p.decoder.DecodeElement(&entry, &start); err != nil {
			return Entry{}, fmt.Errorf("decoding uniref entry: %w", err)
		}
		return entry, nil
	}
}

// ToXML converts an Entry back to its XML representation, indented with a
// single space per nesting level.
func (e *Entry) ToXML() (string, error) {
	out, err := xml.MarshalIndent(e, "", " ")
	if err != nil {
		return "", err
	}
	return string(out), nil
}
a/lib/bio/uniref/uniref_test.go b/lib/bio/uniref/uniref_test.go new file mode 100644 index 0000000..5c38cf2 --- /dev/null +++ b/lib/bio/uniref/uniref_test.go @@ -0,0 +1,233 @@ +package uniref + +import ( + "io" + "strings" + "testing" +) + +// Test data +const testData = ` + + +Cluster: uncharacterized protein LOC134193701 + + + + + + + + + + + + + +MGR + + + +Cluster: LOW QUALITY PROTEIN: titin + + + + + + + + + + + + + +MSEQ + + +` + +func TestUniRefParser(t *testing.T) { + tests := []struct { + name string + testFunc func(*testing.T) + }{ + {"TestBasicParsing", testBasicParsing}, + {"TestEmptyHeader", testEmptyHeader}, + {"TestSequentialReading", testSequentialReading}, + {"TestXMLExport", testXMLExport}, + {"TestPropertyAccess", testPropertyAccess}, + {"TestSequenceData", testSequenceData}, + } + + for _, tt := range tests { + t.Run(tt.name, tt.testFunc) + } +} + +func testBasicParsing(t *testing.T) { + parser, err := NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + entry, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse first entry: %v", err) + } + + // Test first entry + if entry.ID != "UniRef50_UPI002E2621C6" { + t.Errorf("Expected ID UniRef50_UPI002E2621C6, got %s", entry.ID) + } + if entry.Name != "Cluster: uncharacterized protein LOC134193701" { + t.Errorf("Expected name 'Cluster: uncharacterized protein LOC134193701', got %s", entry.Name) + } + if len(entry.Properties) != 3 { + t.Errorf("Expected 3 properties, got %d", len(entry.Properties)) + } +} + +func testEmptyHeader(t *testing.T) { + parser, err := NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + header, err := parser.Header() + if err != nil { + t.Errorf("Expected no error for empty header, got %v", err) + } + if header != (Header{}) { + t.Error("Expected empty header struct") + } +} + +func testSequentialReading(t *testing.T) { + parser, err := 
NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + // First entry + entry1, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse first entry: %v", err) + } + if entry1.ID != "UniRef50_UPI002E2621C6" { + t.Errorf("First entry: expected ID UniRef50_UPI002E2621C6, got %s", entry1.ID) + } + + // Second entry + entry2, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse second entry: %v", err) + } + if entry2.ID != "UniRef50_UPI00358F51CD" { + t.Errorf("Second entry: expected ID UniRef50_UPI00358F51CD, got %s", entry2.ID) + } + + // Should be EOF now + _, err = parser.Next() + if err != io.EOF { + t.Errorf("Expected EOF after second entry, got %v", err) + } +} + +func testXMLExport(t *testing.T) { + parser, err := NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + entry, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse entry: %v", err) + } + + xml, err := entry.ToXML() + if err != nil { + t.Fatalf("Failed to export XML: %v", err) + } + + // Test that exported XML contains key elements + expectedElements := []string{ + `id="UniRef50_UPI002E2621C6"`, + `updated="2024-05-29"`, + `Cluster: uncharacterized protein LOC134193701`, + `checksum="428270C7C0D6A56C"`, + `>MGR`, + } + + for _, expected := range expectedElements { + if !strings.Contains(xml, expected) { + t.Errorf("Expected XML to contain '%s', but it didn't", expected) + } + } +} + +func testPropertyAccess(t *testing.T) { + parser, err := NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + entry, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse entry: %v", err) + } + + // Test property access + if len(entry.Properties) == 0 { + t.Fatal("Expected properties to be present") + } + + // Check specific property values + memberCountFound := false + for _, prop := range 
entry.Properties { + if prop.Type == "member count" && prop.Value == "1" { + memberCountFound = true + break + } + } + if !memberCountFound { + t.Error("Expected to find member count property with value '1'") + } +} + +func testSequenceData(t *testing.T) { + parser, err := NewParser(strings.NewReader(testData)) + if err != nil { + t.Fatalf("Failed to create parser: %v", err) + } + + entry, err := parser.Next() + if err != nil { + t.Fatalf("Failed to parse entry: %v", err) + } + + // Test sequence data + sequence := entry.RepMember.Sequence + if sequence == nil { + t.Fatal("Expected sequence to be present") + } + + expectedTests := []struct { + name string + got interface{} + expected interface{} + }{ + {"Length", sequence.Length, 49499}, + {"Checksum", sequence.Checksum, "428270C7C0D6A56C"}, + {"Value", sequence.Value, "MGR"}, + } + + for _, tt := range expectedTests { + if tt.got != tt.expected { + t.Errorf("%s: expected %v, got %v", tt.name, tt.expected, tt.got) + } + } +}