-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNumberDocumentCreator.cs
66 lines (54 loc) · 2.36 KB
/
NumberDocumentCreator.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
//*********************************************************************************************************
//NumberDocumentCreator - generate collections of random number tokens for minhashing.
//*********************************************************************************************************
using System;
using System.Collections.Generic;
using System.Linq;
namespace LinstaMatch
{
/// <summary>
/// Creates collections of random numbers which are intended to simulate words / tokens parsed from a document.
/// Every generated document is stored twice under the same 1-based id: once as an array
/// (<see cref="documentCollection"/>) and once as an independent list copy (<see cref="documentCollectionList"/>).
/// </summary>
class NumberDocumentCreator
{
    /// <summary>Upper bound on the token count of a generated document, as passed to the constructor.</summary>
    public int documentMaxTokens;

    /// <summary>Number of documents requested from the constructor.</summary>
    public int documentCount;

    /// <summary>Generated documents as arrays, keyed by 1-based document id.</summary>
    public Dictionary<int, int[]> documentCollection = new Dictionary<int, int[]>();

    /// <summary>The same documents as lists (separate copies of the arrays), keyed by the same ids.</summary>
    public Dictionary<int, List<int>> documentCollectionList = new Dictionary<int, List<int>>();

    // One generator shared by every document this instance creates.
    private readonly Random r = new Random();

    /// <summary>
    /// Stores the settings and immediately generates <paramref name="DocumentsToCreate"/> random documents.
    /// </summary>
    /// <param name="DocumentsToCreate">Number of documents to generate; values of zero or less generate none.</param>
    /// <param name="DocumentMaxTokens">Maximum number of tokens per document; must not be negative.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="DocumentMaxTokens"/> is negative and at least one document is requested.
    /// </exception>
    public NumberDocumentCreator(int DocumentsToCreate, int DocumentMaxTokens)
    {
        documentMaxTokens = DocumentMaxTokens;
        documentCount = DocumentsToCreate;
        fillDocumentCollection(DocumentsToCreate);
    }

    /// <summary>
    /// Generates documents with ids 1..<paramref name="documentCount"/> and adds each one to both collections.
    /// </summary>
    /// <param name="documentCount">Number of documents to generate (this parameter shadows the field of the same name).</param>
    /// <exception cref="ArgumentException">
    /// An id in the range is already present, e.g. when this method is called a second time on the same instance.
    /// </exception>
    public void fillDocumentCollection(int documentCount)
    {
        for (int i = 1; i <= documentCount; i++)
        {
            int[] doc = createDocument(documentMaxTokens);
            documentCollection.Add(i, doc);
            // ToList() copies, so the list view can be modified without touching the array view.
            documentCollectionList.Add(i, doc.ToList());
        }
    }

    /// <summary>
    /// Creates one simulated document: an array of random tokens whose length is chosen at random
    /// between 25% and 100% (inclusive) of <paramref name="documentMaxTokens"/>.
    /// </summary>
    /// <param name="documentMaxTokens">
    /// Maximum number of tokens in the document; also the exclusive upper bound of each token value.
    /// </param>
    /// <returns>The generated tokens, each in the range [0, <paramref name="documentMaxTokens"/>).</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="documentMaxTokens"/> is negative.</exception>
    public int[] createDocument(int documentMaxTokens)
    {
        if (documentMaxTokens < 0)
        {
            throw new ArgumentOutOfRangeException("documentMaxTokens", documentMaxTokens, "The maximum token count must not be negative.");
        }

        // Select a random size between 25% and 100% of documentMaxTokens (simulate documents of different sizes).
        int minTokens = (int)(documentMaxTokens * 0.25);

        // Random.Next excludes its upper bound, so add one to make 100% reachable (guarding against overflow).
        int maxTokensExclusive = documentMaxTokens == int.MaxValue ? int.MaxValue : documentMaxTokens + 1;
        int tokenCount = r.Next(minTokens, maxTokensExclusive);

        // Create random tokens for our document.
        int[] tokens = new int[tokenCount];
        for (int i = 0; i < tokenCount; i++)
        {
            tokens[i] = r.Next(0, documentMaxTokens);
        }
        return tokens;
    }

    /// <summary>
    /// Calculates the Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections, treating each as a set
    /// (duplicate values are counted once).
    /// </summary>
    /// <param name="setA">First token collection.</param>
    /// <param name="setB">Second token collection.</param>
    /// <returns>
    /// A value between 0.0 and 1.0. Two empty inputs are identical sets and yield 1.0 rather than NaN.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="setA"/> or <paramref name="setB"/> is null.</exception>
    public static double calculateJaccard(int[] setA, int[] setB)
    {
        if (setA == null)
        {
            throw new ArgumentNullException("setA");
        }
        if (setB == null)
        {
            throw new ArgumentNullException("setB");
        }

        HashSet<int> distinctA = new HashSet<int>(setA);
        HashSet<int> distinctB = new HashSet<int>(setB);

        int intersection = distinctA.Count(distinctB.Contains);
        // Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
        int union = distinctA.Count + distinctB.Count - intersection;

        if (union == 0)
        {
            // Both sets are empty; avoid the 0/0 division that would produce NaN.
            return 1.0;
        }
        return (double)intersection / union;
    }
}
}