diff --git a/bpbio.nimble b/bpbio.nimble index dc80682..265f33c 100644 --- a/bpbio.nimble +++ b/bpbio.nimble @@ -1,5 +1,5 @@ # Package -version = "0.1.5" +version = "0.1.7" author = "Brent Pedersen" description = "collection of command-line tools and small utility functions for genomics" license = "MIT" diff --git a/src/bpbiopkg/info.nim b/src/bpbiopkg/info.nim index 99dcb32..bb81c00 100644 --- a/src/bpbiopkg/info.nim +++ b/src/bpbiopkg/info.nim @@ -1 +1 @@ -var version* = "0.1.6" +var version* = "0.1.7" diff --git a/src/bpbiopkg/pedfile.nim b/src/bpbiopkg/pedfile.nim index c368256..a403b34 100644 --- a/src/bpbiopkg/pedfile.nim +++ b/src/bpbiopkg/pedfile.nim @@ -43,39 +43,6 @@ proc spouse*(s:Sample): Sample {.inline.} = if k.dad == s: return k.mom return k.dad -proc addAncestors(q: var Deque[Sample], o:Sample) = - if o.mom != nil: - q.addLast(o.mom) - if o.dad != nil: - q.addLast(o.dad) - if o.mom != nil: - q.addAncestors(o.mom) - if o.dad != nil: - q.addAncestors(o.dad) - -proc successors(o:Sample, samples: var seq[Sample]) = - ## add kids and kids of kids - for kid in o.kids: - samples.add(kid) - for kid in o.kids: - successors(kid, samples) - -proc successors(o:Sample): seq[Sample] = - successors(o, result) - -proc ancestors(o:Sample, samples:var seq[Sample]) = - if o.mom != nil: - samples.add(o.mom) - if o.dad != nil: - samples.add(o.dad) - if o.mom != nil: - ancestors(o.mom, samples) - if o.dad != nil: - ancestors(o.dad, samples) - -proc ancestors(o:Sample): seq[Sample] = - ancestors(o, result) - proc hash*(s:Sample): Hash = var h: Hash = 0 @@ -83,140 +50,96 @@ proc hash*(s:Sample): Hash = h = h !& hash(s.id) result = !$h -proc lowest_common_ancestors(samples:seq[Sample], a:Sample, b:Sample): seq[Sample] = - var all_ancestors = initSet[Sample](8) - var common_ancestors = initSet[Sample](2) - for i, target in @[a, b]: - var parents = initSet[Sample](8) - var q = initDeque[Sample]() - - q.addFirst(target) - while q.len > 0: - var n = q.popFirst - if parents.contains(n): continue - # A predecessor of n is a node m such that there exists a directed edge from m to n. - # A successor of n is a node m such that there exists a directed edge from n to m. - # - # TODO: might need to flip addKids and ancestors. - q.addAncestors(n) - parents.incl(n) - - if i == 0: - common_ancestors.incl(parents) - else: - common_ancestors = common_ancestors.intersection(parents) - all_ancestors.incl(parents) - - - for p in common_ancestors: - for child in p.successors: - if not common_ancestors.contains(child) and child in all_ancestors: - result.add(p) - break - -type dsample = object - s: Sample - dist: int - -proc dijkstra(samples:seq[Sample], a:Sample, b:Sample): int = - ## return the shortest path from a to b. in this case a must be - ## an predecessor (older than) b. - ## see: https://github.com/mburst/dijkstras-algorithm/blob/master/dijkstras.py - var dsamples = newSeqOfCap[dsample](4) - var previous = initTable[Sample, Sample](4) - if a.family_id != b.family_id: return -1 - - var h = newHeap[dsample]() do (a, b: dsample) -> int: - return a.dist - b.dist - - for v in samples: - if v.family_id != a.family_id: continue - if v == a: dsamples.add(dsample(s:v, dist:0)) - else: dsamples.add(dsample(s:v, dist:samples.len + 2)) - h.push(dsamples[dsamples.high]) - - while h.size > 0: - var dsmallest = h.pop() - var smallest = dsmallest.s - if smallest == b: - if not previous.contains(smallest): return -1 - while previous.contains(smallest): - result += 1 - smallest = previous[smallest] - return - - if dsmallest.dist == samples.len + 2: break - - for okid in dsmallest.s.kids: - # TODO: make this more efficient. - var kid:dsample - for s in dsamples: - if s.s == okid: - kid = s - break - var alt = dsmallest.dist + 1 - if alt < kid.dist: - # TODO: make sure kid with updated dist gets into the heap - kid.dist = alt - previous[okid] = dsmallest.s - - var heapSamples = toSeq(h.items) - - h = newHeap[dsample](proc(a, b: dsample): int = - return a.dist - b.dist - ) - for s in heapSamples: - if s.s.id == okid.id: - h.push(kid) - else: - h.push(s) - return -1 - - -proc relatedness*(a:Sample, b:Sample, samples: seq[Sample]): float64 = - ## coefficient of relatedness of 2 samples given their family. - ## to differentiates siblings from parent-child relationships, - ## siblings are return with a value of 0.49 - ## samples from different families are given a value of -1. - if a.family_id != b.family_id: return -1 - if a.dad == b or a.mom == b: return 0.5 - if b.dad == a or b.mom == a: return 0.5 - - # sibs get a special-case of 0.49 to differentiate from parent-child - if a.dad != nil and a.dad == b.dad and a.mom != nil and a.mom == b.mom: - return 0.49 - - if a notin samples: return -1'f64 - if b notin samples: return -1'f64 - - var lca = lowest_common_ancestors(samples, b, a) - if lca.len == 0: return 0 - var - amax: int = 1000 # hacky use of 1000 and empty value. - bmax: int = 1000 - n = 0 - - for anc in lca: - if anc != a: - var d = dijkstra(samples, anc, a) - if d > 0: - amax = min(amax, d) - if anc != b: - var d = dijkstra(samples, anc, b) - if d > 0: - bmax = min(bmax, dijkstra(samples, anc, b)) - - # if one of the samples is in the set, then their distance is 1. - if a in lca: - amax = 1 - if b in lca: - bmax = 1 - - if amax != 1000: n += 1 else: amax = 0 - if bmax != 1000: n += 1 else: bmax = 0 - - # subtract 2 because the path includes the end sample which we don't need. - return n.float64 * pow(2'f64, -float64(amax + bmax)) + +type sample_path = tuple[sample:Sample, a_path:seq[Sample], b_path:seq[Sample]] + +proc LCA(a:Sample, b:Sample, lcas: var HashSet[sample_path], a_path:seq[Sample], b_path:seq[Sample]) = + + var a_path = a_path + var b_path = b_path + + if a == nil: return + if b == nil: return + + if a == b: + lcas.incl((a, a_path, b_path)) + + block: + var a_path = a_path & a.mom + LCA(a.mom, b, lcas, a_path, b_path) + block: + var a_path = a_path & a.dad + LCA(a.dad, b, lcas, a_path, b_path) + block: + var b_path = b_path & b.mom + LCA(a, b.mom, lcas, a_path, b_path) + block: + var b_path = b_path & b.dad + LCA(a, b.dad, lcas, a_path, b_path) + +proc double_use(a:seq[Sample], b:seq[Sample]): bool {.inline.} = + # a is a chain of samples up to common ancestor + # b is chain of samples up to common ancestor + # if a and b have the same tails, then we have double-use + # and we don't count this chain + # http://genetic-genealogy.co.uk/Toc115570135.html + var ai = a.high + var bi = b.high + doAssert a[ai] == b[bi] + ai -= 1 + bi -= 1 + if ai < 0 or bi < 0: return false + + # a:@[Sample(id:8451, i:-1, affected:false), Sample(id:8448, i:-1, affected:false)] + # b:@[Sample(id:8683, i:-1, affected:false), Sample(id:8451, i:-1, affected:false), Sample(id:8448, i:-1, affected:false)] + + # go backwards through the chains + result = false + while ai > 0 or bi > 0: + #echo "testing:", ai, ",", bi + if a[ai] != b[bi]: + #echo "bailing. ai:", ai, " bi:", bi + return false + return true + #result = true + #ai -= 1 + #bi -= 1 + #if ai <= 0 or bi <= 0: break + + when defined(superdebug): + echo "ai:", ai, " bi:", bi, "dup?", result + +proc is_full_sib_with(a:Sample, b:Sample): bool {.inline.} = + return a.dad != nil and b.dad != nil and a.dad == b.dad and a.mom != nil and b.mom != nil and a.mom == b.mom + +proc relatedness*(a:Sample, b:Sample): float = + if a.family_id != b.family_id: return -1.0 + var r = initHashSet[sample_path](8) + LCA(a, b, r, @[a], @[b]) + if len(r) == 0: return 0.0 + + # now get a seq and sort by shortest totlal path-length + var rs = newSeq[sample_path]() + for x in r: + when defined(superdebug): + echo ">>>>>>>>>>>>>>>>>>>>>>> " + echo "a:", x.a_path + echo "b:", x.b_path + if double_use(x.a_path, x.b_path): + when defined(superdebug): + echo "double" + echo "<<<<<<<<<<<<<<<<<<<<" + continue + rs.add(x) + when defined(superdebug): + echo "not double" + echo "<<<<<<<<<<<<<<<<<<<<" + #echo " " + for r in rs: + result += pow(2'f64, -float64(r.a_path.len - 1 + r.b_path.len - 1))#*pow(1 + prel, 0.5) + + if result == 0.5 and a.is_full_sib_with(b): + result = 0.49 proc parse_ped*(path: string, verbose:bool=true): seq[Sample] = result = new_seq_of_cap[Sample](10) @@ -315,6 +238,7 @@ when isMainModule: suite "pedfile test suite": + #[ test "that samples match those in vcf": var osamples = samples.match(ovcf) for i, s in osamples: @@ -334,20 +258,7 @@ when isMainModule: check sib.dad == samples[0].dad check sib.mom == samples[0].mom - - test "lowest common ancestor": - var k1 = Sample(family_id:"1", id:"kid1") - var k2 = Sample(family_id:"1", id:"kid2") - var mom = Sample(family_id:"1", id:"mom") - var dad = Sample(family_id:"1", id:"dad") - k1.mom = mom - k2.mom = mom - k1.dad = dad - k2.dad = dad - mom.kids.add(@[k1, k2]) - dad.kids.add(@[k1, k2]) - - echo lowest_common_ancestors(@[k1, k2, mom, dad], k1, k2) + ]# test "dijkstra and relatedness": var k1 = Sample(family_id:"1", id:"kid1") @@ -357,7 +268,9 @@ when isMainModule: var uncle = Sample(family_id: "1", id:"uncle") var cousin = Sample(family_id: "1", id:"cousin") var gma = Sample(family_id:"1", id:"gma") + var gpa = Sample(family_id:"1", id:"gma") var ggma = Sample(family_id:"1", id:"ggma") + var ggpa = Sample(family_id:"1", id:"ggpa") var unrel = Sample(family_id:"1", id:"un") var extern = Sample(family_id:"xxx", id:"extern") k1.mom = mom @@ -365,7 +278,9 @@ when isMainModule: k1.dad = dad k2.dad = dad dad.mom = gma + dad.dad = gpa uncle.mom = gma + uncle.dad = gpa uncle.kids.add(cousin) cousin.dad = uncle gma.kids.add(@[dad, uncle]) @@ -373,23 +288,17 @@ when isMainModule: dad.kids.add(@[k1, k2]) ggma.kids.add(gma) gma.mom = ggma - var fam = @[k1, k2, mom, dad, gma, ggma, cousin, uncle, unrel] - check 2 == dijkstra(fam, gma, k1) - check 1 == dijkstra(fam, mom, k1) - check 3 == dijkstra(fam, ggma, k1) - check 1 == dijkstra(fam, ggma, gma) - check 2 == dijkstra(fam, ggma, dad) - check -1 == dijkstra(fam, ggma, mom) - - check relatedness(uncle, k1, fam) == 0.25 - check relatedness(dad, k1, fam) == 0.5 - check relatedness(k1, k2, fam) == 0.49 # ^^^^^ - check relatedness(k2, mom, fam) == 0.5 - check relatedness(k2, gma, fam) == 0.25 - check relatedness(k2, ggma, fam) == 0.125 - check relatedness(extern, gma, fam) == -1'f64 - check relatedness(unrel, gma, fam) == 0.0'f64 - check relatedness(k1, cousin, fam) == 0.125 + gma.dad = ggpa + + check relatedness(uncle, k1) == 0.25 + check relatedness(dad, k1) == 0.5 + check relatedness(k1, k2) == 0.49 # ^^^^^ + check relatedness(k2, mom) == 0.5 + check relatedness(k2, gma) == 0.25 + check relatedness(k2, ggma) == 0.125 + check relatedness(extern, gma) == -1'f64 + check relatedness(unrel, gma) == 0.0'f64 + check relatedness(k1, cousin) == 0.125 test "relatedness": var k1 = Sample(family_id:"1", id:"kid1") @@ -398,14 +307,15 @@ when isMainModule: dad.kids.add(k1) var fam = @[k1, dad] - check 0.5 == relatedness(k1, dad, fam) + check 0.5 == relatedness(k1, dad) test "relatedness with no relations": + echo "xxx" var a = Sample(family_id:"1", id:"a") var b = Sample(family_id:"2", id:"b") - check relatedness(a, b, @[a, b]) == -1'f64 + check relatedness(a, b) == -1'f64 b.family_id = "1" - check relatedness(a, b, @[a, b]) == -0'f64 + check relatedness(a, b) == -0'f64 test "relatedness with 603 ceph samples": @@ -415,7 +325,7 @@ when isMainModule: for i, sampleA in samples[0.. 0.5: echo &"{$sampleA} {$sampleB} {$rel}" diff --git a/tests/distant.ped b/tests/distant.ped new file mode 100644 index 0000000..1253171 --- /dev/null +++ b/tests/distant.ped @@ -0,0 +1,69 @@ +132829 2777 8100 8101 2 0 43.1 2584062824 +132829 2778 8100 8101 2 0 41.0 2583024373 +132829 2779 8100 8101 1 0 37.4 2584483898 +132829 2780 8100 8101 2 0 39.2 2584936032 +132829 2788 8125 8126 1 0 39.9 2582590123 +132829 2946 8107 8108 2 0 69.7 2583438039 +132829 4841 8100 8101 2 0 35.4 2582953452 +132829 51497 8123 8124 2 0 77.6 2582530597 +132829 51498 8123 8124 2 0 71.9 2582018184 +132829 51499 8123 8124 2 0 69.9 2581752037 +132829 51500 8123 8124 2 0 66.8 2582543997 +132829 51501 8123 8124 2 0 63.5 2581970166 +132829 51502 8123 8124 2 0 60.5 2570151857 +132829 51503 8123 8124 2 0 55.2 2570872972 +132829 51504 8123 8124 2 0 52.9 2582352925 +132829 51505 8136 8135 2 0 35.2 2584368648 +132829 51506 8095 8097 1 0 39.3 2583912108 +132829 51507 8095 8097 2 0 37.6 2583318286 +132829 51508 8095 8097 1 0 33.2 2583275665 +132829 58078 8100 8101 2 0 33.5 2584163431 +132829 8090 8095 8097 1 0 51.0 2583793350 +132829 8091 8095 8097 1 0 50.1 2583046457 +132829 8092 8095 8097 2 0 48.7 2582327343 +132829 8093 8095 8097 2 0 47.1 2583768275 +132829 8094 8100 8101 1 0 48.8 2583799717 +132829 8095 8123 8124 1 0 74.3 2581309467 +132829 8096 8095 8097 1 0 46.0 2582709950 +132829 8097 8107 8108 2 0 73.3 2582705619 +132829 8098 8100 8101 2 0 47.2 2583434244 +132829 8099 8100 8101 2 0 45.2 2582795511 +132829 8100 8139 8140 1 0 73.8 2585535677 +132829 8101 8107 8108 2 0 70.7 2582494103 +132829 8102 8107 8108 2 0 62.4 2583444978 +132829 8103 8107 8108 2 0 58.9 2582744518 +132829 8104 8107 8108 2 0 57.3 2581636990 +132829 8105 8107 8108 2 0 54.5 2582114898 +132829 8106 8107 8108 1 0 63.8 2582836817 +132829 8107 0 0 1 0 98.2 0 +132829 8108 0 0 2 0 96.8 0 +132829 8109 8107 8108 2 0 65.4 2583241794 +132829 8110 8107 8108 1 0 61.1 2582457889 +132829 8123 0 0 1 0 111.7 0 +132829 8124 0 0 2 0 100.1 0 +132829 8125 8123 8124 1 0 80.5 2582311992 +132829 8126 8373 8374 2 0 75.6 2582360471 +132829 8127 8125 8126 1 0 45.9 2582553584 +132829 8128 8125 8126 1 0 53.6 2582411909 +132829 8129 8125 8126 1 0 52.2 2583419646 +132829 8130 8136 8135 1 0 53.0 2585273426 +132829 8135 8123 8124 2 0 73.0 2582591961 +132829 8136 8138 8137 1 0 78.2 2583076253 +132829 8137 0 0 2 0 97.9 0 +132829 8138 0 0 1 0 102.0 0 +132829 8139 0 0 1 0 103.1 0 +132829 8140 0 0 2 0 107.4 0 +132829 8141 8142 8143 1 0 48.6 2583641901 +132829 8142 8123 8124 1 0 79.1 2583160870 +132829 8143 8138 8137 2 0 74.5 2581602218 +132829 8144 8125 8126 2 0 48.4 2582278889 +132829 8145 8125 8126 2 0 50.1 2582791794 +132829 8373 0 0 1 0 109.6 0 +132829 8374 0 0 2 0 101.8 0 +132829 8427 8125 8126 2 0 42.1 2582723088 +132829 8428 8125 8126 1 0 43.9 2574713595 +132829 8429 8136 8135 1 0 44.0 2584626204 +132829 8432 8095 8097 1 0 42.7 2584420804 +132829 8433 8095 8097 1 0 41.4 2582388465 +132829 8598 8142 8143 2 0 52.6 2583388559 +132829 8599 8142 8143 2 0 51.2 2583433351 diff --git a/tests/test-ext.ped b/tests/test-ext.ped new file mode 100644 index 0000000..f6b7bf9 --- /dev/null +++ b/tests/test-ext.ped @@ -0,0 +1,8 @@ +1354 8467 8472 8476 2 0 +1354 8479 8472 8476 2 0 +1354 8472 0 8475 1 0 +1354 8473 8472 8476 1 0 +1354 8474 8472 8476 2 0 +1354 8475 0 0 2 0 +1354 8476 8477 8478 2 0 +1354 8478 0 0 2 0 diff --git a/tests/test_pedfile.nim b/tests/test_pedfile.nim new file mode 100644 index 0000000..0134db6 --- /dev/null +++ b/tests/test_pedfile.nim @@ -0,0 +1,167 @@ +import bpbiopkg/pedfile +import unittest + + +var samples = parse_ped("tests/distant.ped") + +proc get_by_id(samples:seq[Sample], sid:string): Sample = + for s in samples: + if s.id == sid: return s + raise newException(KeyError, sid) + +#8125 8130 +# +suite "extended pedigree": + test "double first cousins": + echo "8141 .. 8130" + var a = samples.get_by_id("8141") + var b = samples.get_by_id("8130") + #var a = samples.get_by_id("8109") + #var b = samples.get_by_id("8110") + + check 0.25 == a.relatedness(b) + + test "uncle": + var a = samples.get_by_id("8135") + var b = samples.get_by_id("8141") + + check 0.25 == a.relatedness(b) + + test "sibs": + + var kida = Sample(family_id:"1", id:"kida") + var kidb = Sample(family_id:"1", id:"kidb") + var dad = Sample(family_id:"1", id:"dad") + var mom = Sample(family_id:"1", id:"mom") + var uncle = Sample(family_id:"1", id:"uncle") + + var gma = Sample(family_id:"1", id:"gma") + var gpa = Sample(family_id:"1", id:"gpa") + + var ggma = Sample(family_id:"1", id:"ggma") + var ggpa = Sample(family_id:"1", id:"ggpa") + + dad.kids = @[kida, kidb] + mom.kids = @[kida, kidb] + kida.dad = dad + kidb.dad = dad + kida.mom = mom + kidb.mom = mom + + dad.mom = gma + dad.dad = gpa + uncle.mom = gma + uncle.dad = gpa + + gma.mom = ggma + gma.dad = ggpa + #### + + gma.kids = @[dad, uncle] + gpa.kids = @[dad, uncle] + + var samples = @[kida, kidb, dad, mom, gma, gpa] + + check 0.49 == kida.relatedness(kidb) + check 0.25 == kida.relatedness(uncle) + + # grandparents are full sibs + gpa.mom = ggma + gpa.dad = ggpa + + check 0.5 < kida.relatedness(kidb) + + test "ext pedigree": + var samples = parse_ped("tests/test-ext.ped") + var a = samples.get_by_id("8467") + var b = samples.get_by_id("8472") + #echo samples + check 0.5 == a.relatedness(b) + + test "sibs": + var k1 = Sample(family_id:"1", id:"kid1") + var k2 = Sample(family_id:"1", id:"kid2") + var mom = Sample(family_id:"1", id:"mom") + var dad = Sample(family_id:"1", id:"dad") + var uncle = Sample(family_id: "1", id:"uncle") + var gma = Sample(family_id: "1", id:"gma") + var gpa = Sample(family_id: "1", id:"gpa") + + k1.mom = mom + k2.mom = mom + k1.dad = dad + k2.dad = dad + + mom.mom = gma + mom.dad = gpa + uncle.mom = gma + uncle.dad = gpa + check k1.relatedness(k2) == 0.49 + check k1.relatedness(uncle) == 0.25 + check uncle.relatedness(k1) == 0.25 + check dad.relatedness(k1) == 0.5 + + test "relatedness": + var k1 = Sample(family_id:"1", id:"kid1") + var k2 = Sample(family_id:"1", id:"kid2") + var mom = Sample(family_id:"1", id:"mom") + var dad = Sample(family_id:"1", id:"dad") + var uncle = Sample(family_id: "1", id:"uncle") + var cousin = Sample(family_id: "1", id:"cousin") + var gma = Sample(family_id:"1", id:"gma") + var gpa = Sample(family_id:"1", id:"gma") + var ggma = Sample(family_id:"1", id:"ggma") + var ggpa = Sample(family_id:"1", id:"ggpa") + var unrel = Sample(family_id:"1", id:"un") + var extern = Sample(family_id:"xxx", id:"extern") + k1.mom = mom + k2.mom = mom + k1.dad = dad + k2.dad = dad + dad.mom = gma + dad.dad = gpa + uncle.mom = gma + uncle.dad = gpa + uncle.kids.add(cousin) + cousin.dad = uncle + gma.kids.add(@[dad, uncle]) + mom.kids.add(@[k1, k2]) + dad.kids.add(@[k1, k2]) + ggma.kids.add(gma) + gma.mom = ggma + gma.dad = ggpa + + check relatedness(uncle, k1) == 0.25 + check k1.relatedness(cousin) == 0.125 + + + test "relatedness also": + var k1 = Sample(family_id:"1", id:"kid1") + var k2 = Sample(family_id:"1", id:"kid2") + var mom = Sample(family_id:"1", id:"mom") + var dad = Sample(family_id:"1", id:"dad") + var uncle = Sample(family_id: "1", id:"uncle") + var cousin = Sample(family_id: "1", id:"cousin") + var gma = Sample(family_id:"1", id:"gma") + var gpa = Sample(family_id: "1", id:"gpa") + var ggma = Sample(family_id:"1", id:"ggma") + var unrel = Sample(family_id:"1", id:"un") + var extern = Sample(family_id:"xxx", id:"extern") + k1.mom = mom + k2.mom = mom + k1.dad = dad + k2.dad = dad + dad.mom = gma + dad.dad = gpa + uncle.mom = gma + uncle.dad = gpa + uncle.kids.add(cousin) + cousin.dad = uncle + gma.kids.add(@[dad, uncle]) + gpa.kids.add(@[dad, uncle]) + + mom.kids.add(@[k1, k2]) + dad.kids.add(@[k1, k2]) + ggma.kids.add(gma) + gma.mom = ggma + check relatedness(uncle, k1) == 0.25