diff --git a/README.md b/README.md
index ae2b515..dcb5132 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
+- Adds uniref parser [#107](https://github.com/Koeng101/dnadesign/pull/107)
+- Fixes iso-8859-1 error in reading uniref data dumps [#106](https://github.com/Koeng101/dnadesign/pull/106)
- Updates uniprot parser to read IDs [#104](https://github.com/Koeng101/dnadesign/pull/104)
- Fixes RecursiveFragment to not add flanks to the initial input [#102](https://github.com/Koeng101/dnadesign/pull/102)
- Fixes add flank bug, releases new version of python lib [#101](https://github.com/Koeng101/dnadesign/pull/101)
diff --git a/lib/bio/bio.go b/lib/bio/bio.go
index 4fc5e72..bbeac43 100644
--- a/lib/bio/bio.go
+++ b/lib/bio/bio.go
@@ -24,6 +24,7 @@ import (
"github.com/koeng101/dnadesign/lib/bio/sam"
"github.com/koeng101/dnadesign/lib/bio/slow5"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
+ "github.com/koeng101/dnadesign/lib/bio/uniref"
)
// Format is a enum of different parser formats.
@@ -63,12 +64,12 @@ Lower level interfaces
// DataTypes defines the possible data types returned by every parser.
type DataTypes interface {
- genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry
+ genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry | uniref.Entry
}
// HeaderTypes defines the possible header types returned by every parser.
type HeaderTypes interface {
- genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header
+ genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header | uniref.Header
}
// ParserInterface is a generic interface that all parsers must support. It is
@@ -171,6 +172,13 @@ func NewUniprotParser(r io.Reader) *Parser[uniprot.Entry, uniprot.Header] {
return &Parser[uniprot.Entry, uniprot.Header]{ParserInterface: uniprot.NewParser(r)}
}
+// NewUnirefParser initiates a new Uniref parser from an io.Reader. No
+// maxLineLength is necessary.
+func NewUnirefParser(r io.Reader) (*Parser[uniref.Entry, uniref.Header], error) {
+ parser, err := uniref.NewParser(r)
+ return &Parser[uniref.Entry, uniref.Header]{ParserInterface: parser}, err
+}
+
/******************************************************************************
Parser higher-level functions
diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go
index 6fbaaaf..a948d7b 100644
--- a/lib/bio/example_test.go
+++ b/lib/bio/example_test.go
@@ -389,6 +389,45 @@ func ExampleNewUniprotParser() {
// Output: P0C9F0
}
+func ExampleNewUnirefParser() {
+ // The following is the first gene of UniRef50 with the sequence truncated.
+	// We're going to put the text as an io.Reader to mock a file. You can
+	// edit the text here to see how the parser works.
+ //
+ // Note: Unlike the uniprot parser, the uniref parser expects that the file is
+	// properly terminated with a closing </UniRef50> tag.
+ uniprotEntryText := strings.NewReader(`
+
+
+Cluster: uncharacterized protein LOC134193701
+
+
+
+
+
+
+
+
+
+
+
+
+
+MGR
+
+
+`)
+ // Now we load the parser, and get the first entry out.
+ parser, _ := bio.NewUnirefParser(uniprotEntryText)
+ entry, _ := parser.Next()
+
+ fmt.Println(entry.ID)
+ // Output: UniRef50_UPI002E2621C6
+}
+
func ExampleNewSamParser() {
// The following can be replaced with a any io.Reader. For example,
// `file, err := os.Open(path)` for file would also work.
diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go
index 63a367a..029a63f 100644
--- a/lib/bio/uniprot/uniprot.go
+++ b/lib/bio/uniprot/uniprot.go
@@ -28,7 +28,6 @@ import (
"io"
"net/http"
"net/url"
- "strings"
)
// Decoder decodes XML elements.
@@ -68,14 +67,6 @@ type Parser struct {
func NewParser(r io.Reader) *Parser {
decoder := xml.NewDecoder(r)
- // Oddly enough, the uniref datasets use iso-8859-1, not UTF-8. So we need
- // to incorporate this decoder charset reader.
- decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
- if strings.ToLower(charset) == "iso-8859-1" {
- return input, nil // ISO-8859-1 bytes can be read directly as UTF-8
- }
- return nil, fmt.Errorf("unsupported charset: %s", charset)
- }
return &Parser{decoder: decoder}
}
diff --git a/lib/bio/uniref/data/uniref300.xml b/lib/bio/uniref/data/uniref300.xml
new file mode 100644
index 0000000..4421dce
--- /dev/null
+++ b/lib/bio/uniref/data/uniref300.xml
@@ -0,0 +1,42 @@
+
+
+
+Cluster: uncharacterized protein LOC134193701
+
+
+
+
+
+
+
+
+
+
+
+
+
+MGR
+
+
+
+Cluster: LOW QUALITY PROTEIN: titin
+
+
+
+
+
+
+
+
+
+
+
+
+
+MSEQ
+
+
+
diff --git a/lib/bio/uniref/example_test.go b/lib/bio/uniref/example_test.go
new file mode 100644
index 0000000..316c18f
--- /dev/null
+++ b/lib/bio/uniref/example_test.go
@@ -0,0 +1,30 @@
+package uniref_test
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+
+ "github.com/koeng101/dnadesign/lib/bio/uniref"
+)
+
+func Example() {
+	// Open the UniRef XML file
+ file, _ := os.Open(filepath.Join("data", "uniref300.xml"))
+ defer file.Close()
+
+ // Create new parser
+ parser, _ := uniref.NewParser(file)
+
+ // Read and print the first entry
+ entry, _ := parser.Next()
+
+ fmt.Printf("Entry ID: %s\n", entry.ID)
+ fmt.Printf("Name: %s\n", entry.Name)
+ fmt.Printf("Sequence Length: %d\n", entry.RepMember.Sequence.Length)
+
+ // Output:
+ // Entry ID: UniRef50_UPI002E2621C6
+ // Name: Cluster: uncharacterized protein LOC134193701
+ // Sequence Length: 49499
+}
diff --git a/lib/bio/uniref/uniref.go b/lib/bio/uniref/uniref.go
new file mode 100644
index 0000000..c94a8f7
--- /dev/null
+++ b/lib/bio/uniref/uniref.go
@@ -0,0 +1,139 @@
+/*
+Package uniref provides a parser for UniRef XML files.
+
+UniRef clusters uniprot proteins by similarity. This is useful for doing
+bioinformatics on protein space, as many proteins are sequenced a ton of times
+in different organisms, and you don't want those proteins to dominate your
+training data.
+
+UniRef data dumps are available as FASTA or XML formatted data. The XML has
+more rich data, so we use that. The parser was created using AI.
+
+UniProt Reference Clusters (UniRef) provide clustered sets of sequences from
+the UniProt Knowledgebase (including isoforms) and selected UniParc records in
+order to obtain complete coverage of the sequence space at several resolutions
+while hiding redundant sequences (but not their descriptions) from view.
+(taken from uniref reference https://www.uniprot.org/help/uniref)
+
+Download uniref data dumps here: https://www.uniprot.org/downloads
+*/
+package uniref
+
+import (
+ "bytes"
+ "encoding/xml"
+ "fmt"
+ "io"
+ "strings"
+)
+
+// Header is an empty struct since UniRef files don't have headers
+type Header struct{}
+
+// Entry represents a UniRef entry
+type Entry struct {
+ XMLName xml.Name `xml:"entry"`
+ ID string `xml:"id,attr"`
+ Updated string `xml:"updated,attr"`
+ Name string `xml:"name"`
+ Properties []Property `xml:"property"`
+ RepMember RepresentativeMember `xml:"representativeMember"`
+ Members []Member `xml:"member"`
+}
+
+// Property represents a property element
+type Property struct {
+ Type string `xml:"type,attr"`
+ Value string `xml:"value,attr"`
+}
+
+// DBReference represents a database reference
+type DBReference struct {
+ Type string `xml:"type,attr"`
+ ID string `xml:"id,attr"`
+ Properties []Property `xml:"property"`
+}
+
+// Sequence represents a sequence element
+type Sequence struct {
+ Length int `xml:"length,attr"`
+ Checksum string `xml:"checksum,attr"`
+ Value string `xml:",chardata"`
+}
+
+// Member represents a member element
+type Member struct {
+ DBRef DBReference `xml:"dbReference"`
+ Sequence *Sequence `xml:"sequence"`
+}
+
+// RepresentativeMember represents the representative member
+type RepresentativeMember Member
+
+// UniRef represents the root element
+type UniRef struct {
+ XMLName xml.Name `xml:"UniRef50"`
+ ReleaseDate string `xml:"releaseDate,attr"`
+ Version string `xml:"version,attr"`
+ Entries []Entry `xml:"entry"`
+}
+
+type Parser struct {
+ decoder *xml.Decoder
+ uniref *UniRef
+ current int
+}
+
+func NewParser(r io.Reader) (*Parser, error) {
+ decoder := xml.NewDecoder(r)
+ decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
+ if strings.ToLower(charset) == "iso-8859-1" {
+ return input, nil
+ }
+ return nil, fmt.Errorf("unsupported charset: %s", charset)
+ }
+
+ return &Parser{
+ decoder: decoder,
+ current: -1,
+ }, nil
+}
+
+// Header returns an empty header since UniRef files don't have headers
+func (p *Parser) Header() (Header, error) {
+ return Header{}, nil
+}
+
+// Next returns the next Entry from the UniRef file
+func (p *Parser) Next() (Entry, error) {
+ // First time reading
+ if p.uniref == nil {
+ p.uniref = &UniRef{}
+ if err := p.decoder.Decode(p.uniref); err != nil {
+ return Entry{}, err
+ }
+ p.current = 0
+ }
+
+ // Check if we've reached the end of entries
+ if p.current >= len(p.uniref.Entries) {
+ return Entry{}, io.EOF
+ }
+
+ // Get current entry and increment counter
+ entry := p.uniref.Entries[p.current]
+ p.current++
+
+ return entry, nil
+}
+
+// ToXML converts an Entry back to its XML representation
+func (e *Entry) ToXML() (string, error) {
+ buf := new(bytes.Buffer)
+ enc := xml.NewEncoder(buf)
+ enc.Indent("", " ")
+ if err := enc.Encode(e); err != nil {
+ return "", err
+ }
+ return buf.String(), nil
+}
diff --git a/lib/bio/uniref/uniref_test.go b/lib/bio/uniref/uniref_test.go
new file mode 100644
index 0000000..5c38cf2
--- /dev/null
+++ b/lib/bio/uniref/uniref_test.go
@@ -0,0 +1,233 @@
+package uniref
+
+import (
+ "io"
+ "strings"
+ "testing"
+)
+
+// Test data
+const testData = `
+
+
+Cluster: uncharacterized protein LOC134193701
+
+
+
+
+
+
+
+
+
+
+
+
+
+MGR
+
+
+
+Cluster: LOW QUALITY PROTEIN: titin
+
+
+
+
+
+
+
+
+
+
+
+
+
+MSEQ
+
+
+`
+
+func TestUniRefParser(t *testing.T) {
+ tests := []struct {
+ name string
+ testFunc func(*testing.T)
+ }{
+ {"TestBasicParsing", testBasicParsing},
+ {"TestEmptyHeader", testEmptyHeader},
+ {"TestSequentialReading", testSequentialReading},
+ {"TestXMLExport", testXMLExport},
+ {"TestPropertyAccess", testPropertyAccess},
+ {"TestSequenceData", testSequenceData},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, tt.testFunc)
+ }
+}
+
+func testBasicParsing(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ entry, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse first entry: %v", err)
+ }
+
+ // Test first entry
+ if entry.ID != "UniRef50_UPI002E2621C6" {
+ t.Errorf("Expected ID UniRef50_UPI002E2621C6, got %s", entry.ID)
+ }
+ if entry.Name != "Cluster: uncharacterized protein LOC134193701" {
+ t.Errorf("Expected name 'Cluster: uncharacterized protein LOC134193701', got %s", entry.Name)
+ }
+ if len(entry.Properties) != 3 {
+ t.Errorf("Expected 3 properties, got %d", len(entry.Properties))
+ }
+}
+
+func testEmptyHeader(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ header, err := parser.Header()
+ if err != nil {
+ t.Errorf("Expected no error for empty header, got %v", err)
+ }
+ if header != (Header{}) {
+ t.Error("Expected empty header struct")
+ }
+}
+
+func testSequentialReading(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ // First entry
+ entry1, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse first entry: %v", err)
+ }
+ if entry1.ID != "UniRef50_UPI002E2621C6" {
+ t.Errorf("First entry: expected ID UniRef50_UPI002E2621C6, got %s", entry1.ID)
+ }
+
+ // Second entry
+ entry2, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse second entry: %v", err)
+ }
+ if entry2.ID != "UniRef50_UPI00358F51CD" {
+ t.Errorf("Second entry: expected ID UniRef50_UPI00358F51CD, got %s", entry2.ID)
+ }
+
+ // Should be EOF now
+ _, err = parser.Next()
+ if err != io.EOF {
+ t.Errorf("Expected EOF after second entry, got %v", err)
+ }
+}
+
+func testXMLExport(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ entry, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse entry: %v", err)
+ }
+
+ xml, err := entry.ToXML()
+ if err != nil {
+ t.Fatalf("Failed to export XML: %v", err)
+ }
+
+ // Test that exported XML contains key elements
+ expectedElements := []string{
+ `id="UniRef50_UPI002E2621C6"`,
+ `updated="2024-05-29"`,
+ `Cluster: uncharacterized protein LOC134193701`,
+ `checksum="428270C7C0D6A56C"`,
+ `>MGR`,
+ }
+
+ for _, expected := range expectedElements {
+ if !strings.Contains(xml, expected) {
+ t.Errorf("Expected XML to contain '%s', but it didn't", expected)
+ }
+ }
+}
+
+func testPropertyAccess(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ entry, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse entry: %v", err)
+ }
+
+ // Test property access
+ if len(entry.Properties) == 0 {
+ t.Fatal("Expected properties to be present")
+ }
+
+ // Check specific property values
+ memberCountFound := false
+ for _, prop := range entry.Properties {
+ if prop.Type == "member count" && prop.Value == "1" {
+ memberCountFound = true
+ break
+ }
+ }
+ if !memberCountFound {
+ t.Error("Expected to find member count property with value '1'")
+ }
+}
+
+func testSequenceData(t *testing.T) {
+ parser, err := NewParser(strings.NewReader(testData))
+ if err != nil {
+ t.Fatalf("Failed to create parser: %v", err)
+ }
+
+ entry, err := parser.Next()
+ if err != nil {
+ t.Fatalf("Failed to parse entry: %v", err)
+ }
+
+ // Test sequence data
+ sequence := entry.RepMember.Sequence
+ if sequence == nil {
+ t.Fatal("Expected sequence to be present")
+ }
+
+ expectedTests := []struct {
+ name string
+ got interface{}
+ expected interface{}
+ }{
+ {"Length", sequence.Length, 49499},
+ {"Checksum", sequence.Checksum, "428270C7C0D6A56C"},
+ {"Value", sequence.Value, "MGR"},
+ }
+
+ for _, tt := range expectedTests {
+ if tt.got != tt.expected {
+ t.Errorf("%s: expected %v, got %v", tt.name, tt.expected, tt.got)
+ }
+ }
+}