Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preparing changes to add format types (classification) from DROID sig file #226

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions cmd/roy/data/DROID_SignatureFile_V114.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- THIS FILE WILL BE REPLACED BEFORE MERGE -->
<FFSignatureFile DateCreated="2023-08-18T15:27:49" Version="114" xmlns="http://www.nationalarchives.gov.uk/pronom/SignatureFile">
<InternalSignatureCollection>
<InternalSignature ID="9" Specificity="Specific">
Expand Down Expand Up @@ -52744,7 +52745,7 @@
PUID="fmt/9" Version="5"/>
<FileFormat ID="612" Name="Tagged Image File Format"
PUID="fmt/10" Version="6"/>
<FileFormat ID="613" MIMEType="application/pdf"
<FileFormat FormatType="Page Description" ID="613" MIMEType="application/pdf"
Name="Acrobat PDF 1.0 - Portable Document Format"
PUID="fmt/14" Version="1.0">
<InternalSignatureID>123</InternalSignatureID>
Expand Down Expand Up @@ -52792,7 +52793,7 @@
<HasPriorityOverFileFormatID>687</HasPriorityOverFileFormatID>
<HasPriorityOverFileFormatID>869</HasPriorityOverFileFormatID>
</FileFormat>
<FileFormat ID="619" MIMEType="image/gif"
<FileFormat FormatType="Image (Raster)" ID="619" MIMEType="image/gif"
Name="Graphics Interchange Format" PUID="fmt/3" Version="87a">
<InternalSignatureID>18</InternalSignatureID>
<Extension>gif</Extension>
Expand Down Expand Up @@ -53005,7 +53006,7 @@
<Extension>wav</Extension>
<HasPriorityOverFileFormatID>2741</HasPriorityOverFileFormatID>
</FileFormat>
<FileFormat ID="655" MIMEType="video/x-msvideo"
<FileFormat FormatType="Audio, Video" ID="655" MIMEType="video/x-msvideo"
Name="Audio/Video Interleaved Format" PUID="fmt/5">
<InternalSignatureID>51</InternalSignatureID>
<Extension>avi</Extension>
Expand Down Expand Up @@ -53060,7 +53061,7 @@
<InternalSignatureID>54</InternalSignatureID>
<Extension>wrl</Extension>
</FileFormat>
<FileFormat ID="664" MIMEType="image/png"
<FileFormat FormatType="Image (Raster)" ID="664" MIMEType="image/png"
Name="Portable Network Graphics" PUID="fmt/11" Version="1.0">
<InternalSignatureID>58</InternalSignatureID>
<Extension>png</Extension>
Expand Down Expand Up @@ -53523,7 +53524,7 @@
<InternalSignatureID>127</InternalSignatureID>
<Extension>doc</Extension>
</FileFormat>
<FileFormat ID="735" MIMEType="audio/x-wav"
<FileFormat FormatType="Audio" ID="735" MIMEType="audio/x-wav"
Name="Broadcast WAVE" PUID="fmt/1" Version="0 Generic">
<InternalSignatureID>1032</InternalSignatureID>
<Extension>wav</Extension>
Expand Down Expand Up @@ -58786,7 +58787,7 @@
<InternalSignatureID>1364</InternalSignatureID>
<Extension>exr</Extension>
</FileFormat>
<FileFormat ID="1807" Name="Nearly Raw Raster Data"
<FileFormat FormatType="Image (Raster), Dataset" ID="1807" Name="Nearly Raw Raster Data"
PUID="fmt/1002" Version="1">
<InternalSignatureID>1357</InternalSignatureID>
<Extension>nrrd</Extension>
Expand Down
59 changes: 47 additions & 12 deletions cmd/sf/pronom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ type pronomIdentificationTests struct {

var skeletons = make(map[string]*fstest.MapFile)

var minimalPronom = []string{"fmt/1", "fmt/3", "fmt/5", "fmt/11", "fmt/14"}
var minimalPronom = []string{"fmt/1", "fmt/3", "fmt/5", "fmt/11", "fmt/14", "fmt/1002"}

// Populate the global skeletons map from string-based byte-sequences to
// save having to store skeletons on disk and read from them.
Expand Down Expand Up @@ -59,6 +59,7 @@ func makeSkeletons() {
"")
files["fmt-3-signature-id-18.gif"] = "4749463837613b"
files["badf00d.unknown"] = "badf00d"
files["fmt-1002-signature-id-1357.nrrd"] = "4e52524430302e3031"
for key, val := range files {
data, _ := hex.DecodeString(val)
skeletons[key] = &fstest.MapFile{Data: []byte(data)}
Expand Down Expand Up @@ -125,20 +126,21 @@ var pronomIDs = []pronomIdentificationTests{
"Audio, Video",
"extension match avi; byte match at 0, 12",
"",
}, {
"pronom",
"fmt/1002",
"Nearly Raw Raster Data",
"1",
"",
"Image (Raster), Dataset",
"extension match nrrd; byte match at 0, 9",
"",
},
}

// TestPronom looks to see if PRONOM identification results for a
// minimized PRONOM dataset are correct and contain the information we
// anticipate.
func TestPronom(t *testing.T) {
sf := siegfried.New()
config.SetHome(DataPath)
identifier, err := pronom.New(config.SetLimit(minimalPronom))
if err != nil {
t.Errorf("Error creating new PRONOM identifier: %s", err)
}
sf.Add(identifier)
// runIdentificationWithSF provides a number of tests that can be run
// against a Siegfried.
func runIdentificationWithSF(sf *siegfried.Siegfried, t *testing.T) {
makeSkeletons()
skeletonFS := fstest.MapFS(skeletons)
testDirListing, err := skeletonFS.ReadDir(".")
Expand Down Expand Up @@ -182,5 +184,38 @@ func TestPronom(t *testing.T) {
t.Errorf("Results not equal for %s; expected %v; got %v", res.puid, pronomIDs[idx], res)
}
}
}

// TestPronom looks to see if PRONOM identification results for a
// minimized PRONOM dataset are correct and contain the information we
// anticipate.
func TestPronom(t *testing.T) {
sf := siegfried.New()
config.SetHome(DataPath)
identifier, err := pronom.New(config.SetLimit(minimalPronom))
if err != nil {
t.Errorf("Error creating new PRONOM identifier: %s", err)
}
sf.Add(identifier)
runIdentificationWithSF(sf, t)
config.Clear()()
}

// TestPronomNoReports performs the same tests as TestPronom, but
// against a Siegfried created purely from a signature file.
func TestPronomNoReports(t *testing.T) {
sf := siegfried.New()
config.SetHome(DataPath)
config.SetNoContainer()()
config.SetNoReports()()
if config.Reports() != "" {
t.Errorf("pronon.reports should be unset, not: %s", config.Reports())
}
identifier, err := pronom.New(config.SetLimit(minimalPronom))
if err != nil {
t.Errorf("Error creating new PRONOM identifier: %s", err)
}
sf.Add(identifier)
runIdentificationWithSF(sf, t)
config.Clear()()
}
2 changes: 2 additions & 0 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,15 @@ func IsArchive(id string) Archive {
// Clear clears loc and mimeinfo details to avoid pollution when creating multiple identifiers in same session
func Clear() func() private {
return func() private {
identifier.noContainer = false
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently we need a change like this, but we're not then resetting reports to pronom, so I am not sure which way is correct or if there is still some pollution somewhere?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resetting reports to pronom probably a good idea too. In principle I'd say it's good to add anything to Clear where you are setting config options that change defaults.

identifier.name = ""
identifier.extend = nil
identifier.limit = nil
identifier.exclude = nil
identifier.multi = Conclusive
loc.fdd = ""
mimeinfo.mi = ""
pronom.reports = "pronom"
return private{}
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/pronom/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ func New(opts ...config.Option) (core.Identifier, error) {
pronom = identifier.ApplyConfig(pronom)
id := &Identifier{
Base: identifier.New(pronom, config.ZipPuid()),
hasClass: config.Reports() != "" && !config.NoClass(),
hasClass: !config.NoClass(),
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is all we need to do here when this is possible with the signature file too?

infos: infos(pronom.Infos()),
}
if id.Multi() == config.DROID {
Expand Down
1 change: 1 addition & 0 deletions pkg/pronom/internal/mappings/droid.go
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest dropping the extra comments added here.

Agree absolutely that anything exported should be commented (and much to do here) but these structs aren't part of the public API (hidden by the "internal" path) & are only exported internally for the sake of unmarshalling (the golang Unmarshal funcs can only assign to exported fields).

Given these structs are all just mappings to the XML file it is hard to say anything else meaningful about them apart from what's said at the top of the file - so in this case I think it is best to just keep this mapping file lean and clean (even if linters scream about it!)

Copy link
Collaborator Author

@ross-spencer ross-spencer Apr 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

even if linters scream about it!

That's just it:

C:\Users\Spencer\code\git\siegfried\pkg\pronom>%gopath%\bin\golint ./...
internal\mappings\droid.go:22:6: exported type Droid should have comment or be unexported
internal\mappings\droid.go:29:6: exported type InternalSignature should have comment or be unexported
internal\mappings\droid.go:34:6: exported type ByteSeq should have comment or be unexported
internal\mappings\droid.go:39:6: exported type SubSequence should have comment or be unexported
internal\mappings\droid.go:48:6: exported type Fragment should have comment or be unexported
internal\mappings\droid.go:55:6: exported type FileFormat should have comment or be unexported
... ~45 more warnings ...  

So, as with other code you may have noticed, I follow the process of -- if I am in the code, I try to add placeholder comments so that we can say more in future, or see fewer warnings. It's probably more idiomatic to have some comment rather than none, but maybe there are some project-level settings that make the intent to not address these easier for maintainers? I.e. something a linter can pick up and run with?

Given these structs are all just mappings to the XML file it is hard to say anything else meaningful about them apart from what's said at the top of the file

My documentation isn't that useful, but I don't agree everyone has the same knowledge about these objects that means it's a given this resource doesn't add more. Godoc defaults its view to show the layout and comments on internal packages: https://pkg.go.dev/github.com/richardlehane/[email protected]/pkg/pronom/internal/mappings which does lead to interesting possibilities in digipres.

I'll have a think about some sort of pre-commit settings and rules to ignore internal packages, but otherwise this is a simple enough commit to drop when I rebase later on.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there was a discussion of this on the golint issues, e.g. win-t feels my pain: golang/lint#186 (comment)

golint is deprecated now. I'm using staticcheck for my linter these days which I don't think nags about this (though it does have all sorts of other "opinions" about our code!).

we should agree on a common linter and document on the wiki: e.g. a short style guide in the contributors' docs

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds reasonable to me and I'm very happy to help draft/review. I keep a number of references/ideas in this area which may be useful.. I'll check out static check as well and see what tooling it offers.

Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ type FileFormat struct {
Name string `xml:",attr"`
Version string `xml:",attr"`
MIMEType string `xml:",attr"`
FormatType string `xml:",attr"`
Extensions []string `xml:"Extension"`
Signatures []int `xml:"InternalSignatureID"`
Priorities []int `xml:"HasPriorityOverFileFormatID"`
Expand Down
1 change: 1 addition & 0 deletions pkg/pronom/parseable.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ func (d *droid) Infos() map[string]identifier.FormatInfo {
name: strings.TrimSpace(v.Name),
version: strings.TrimSpace(v.Version),
mimeType: strings.TrimSpace(v.MIMEType),
class: strings.TrimSpace(v.FormatType),
}
}
return infos
Expand Down
53 changes: 40 additions & 13 deletions pkg/pronom/pronom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"sort"
"testing"

"github.com/richardlehane/siegfried/internal/identifier"
"github.com/richardlehane/siegfried/pkg/config"
)

Expand All @@ -21,15 +22,9 @@ func TestNew(t *testing.T) {
}
}

// TestFormatInfos inspects the values loaded into a PRONOM identifier
// from a minimal PRONOM dataset, i.e. fewer than loading all of PRONOM.
func TestFormatInfos(t *testing.T) {
config.SetHome(dataPath)
config.SetLimit(minimalPronom)()
i, err := NewPronom()
if err != nil {
t.Error(err)
}
// verifyIdentifier provides a number of tests that can be run against
// a PRONOM identifier.
func verifyIdentifier(i identifier.Parseable, t *testing.T) {
const minReports int = 5
if len(i.Infos()) != minReports {
t.Error("Unexpected number of reports for PRONOM minimal tests")
Expand Down Expand Up @@ -84,7 +79,7 @@ func TestFormatInfos(t *testing.T) {
sort.Strings(puids)
sort.Strings(expectedPuids)
if !reflect.DeepEqual(puids, expectedPuids) {
t.Error("PUIDs from minimal PRONOM set do not match expected values")
t.Errorf("PUIDs from minimal PRONOM set do not match expected values; expected %v; got %v", puids, expectedPuids)
}
sort.Strings(names)
sort.Strings(expectedNames)
Expand All @@ -94,17 +89,49 @@ func TestFormatInfos(t *testing.T) {
sort.Strings(versions)
sort.Strings(expectedVersions)
if !reflect.DeepEqual(versions, expectedVersions) {
t.Error("Format versions from minimal PRONOM set do not match expected values")
t.Errorf("Format versions from minimal PRONOM set do not match expected values; expected %v; got %v", versions, expectedVersions)
}
sort.Strings(mimes)
sort.Strings(expectedMimes)
if !reflect.DeepEqual(mimes, expectedMimes) {
t.Error("MIMETypes from minimal PRONOM set do not match expected values")
t.Errorf("MIMETypes from minimal PRONOM set do not match expected values; expected %v; got %v", mimes, expectedMimes)
}
sort.Strings(types)
sort.Strings(expectedTypes)
if !reflect.DeepEqual(types, expectedTypes) {
t.Error("Format types from minimal PRONOM set do not match expected values")
t.Errorf("Format types from minimal PRONOM set do not match expected values; expected %v; got %v", types, expectedTypes)
}
}

// TestFormatInfosDefault inspects the values loaded into a PRONOM
// identifier from a minimal PRONOM dataset, i.e. fewer than loading
// all of PRONOM.
func TestFormatInfosDefault(t *testing.T) {
	// Reset the package-level config on every exit path so the limit
	// set here cannot pollute identifiers built by later tests.
	defer config.Clear()()
	config.SetHome(dataPath)
	config.SetLimit(minimalPronom)()
	i, err := NewPronom()
	if err != nil {
		// verifyIdentifier calls methods on i, so do not continue with
		// a value returned alongside an error.
		t.Fatal(err)
	}
	verifyIdentifier(i, t)
}

// TestFormatInfosNoReports performs the same tests as TestFormatInfosDefault
// but does so without loading PRONOM reports, preferring to create an
// identifier using a signature file only.
func TestFormatInfosNoReports(t *testing.T) {
	// Reset the package-level config on every exit path; this test
	// changes defaults (no container, no reports) that must not leak
	// into other tests.
	defer config.Clear()()
	config.SetHome(dataPath)
	config.SetLimit(minimalPronom)()
	config.SetNoContainer()()
	config.SetNoReports()()
	if config.Reports() != "" {
		t.Errorf("pronom.reports should be unset, not: %s", config.Reports())
	}
	i, err := NewPronom()
	if err != nil {
		// verifyIdentifier calls methods on i, so do not continue with
		// a value returned alongside an error.
		t.Fatal(err)
	}
	verifyIdentifier(i, t)
}