#!/usr/bin/env ruby
# encoding: utf-8
# Index all markdown files in a directory
#
# Usage: run-index config.json
#
# Requires an OpenAI API key stored in the DOT_OPENAI_KEY environment variable
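#
# An illustrative config.json (field names inferred from how this script
# reads them below; the values are placeholders, not a canonical schema):
#
#   {
#     "paths": [
#       {
#         "name": "notes",
#         "reader": "markdown",
#         "dir": "~/notes",
#         "out": "~/notes/index.jsonl",
#         "nameMatch": "*.{md,markdown}"
#       }
#     ]
#   }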
require "json"
require "ostruct"
require "digest"
require_relative "llm/openai"
require_relative "llm/embedding"
require_relative "readers/reader"
if ARGV.length != 1
  STDOUT << "Invalid arguments: expected a single config file\n"
  exit 1
end
config = JSON.parse(File.read(ARGV[0]))
CONFIG = OpenStruct.new(config)
CONFIG.paths = CONFIG.paths.map { |p| OpenStruct.new(p) }
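# Wrapping the parsed JSON in OpenStruct lets config keys be read as method
# calls below (e.g. path.dir, path.out, path.nameMatch).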
OPENAI_KEY = ENV["DOT_OPENAI_KEY"] || ""
if OPENAI_KEY.empty?
  STDOUT << "Remember to set env DOT_OPENAI_KEY\n"
  exit 9
end
CONFIG.paths.each do |path|
STDOUT << "Read path name: #{path.name}, reader: #{path.reader}\n"
# Read existing index
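  # The index is JSON Lines: one JSON object per line, keyed by the chunk's
  # SHA-256 hash so unchanged chunks can be reused below.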
STDOUT << "Read existing index: #{path.out}, time: @#{Time.now}\n"
index_db = {}
index_file = File.expand_path(path.out)
File.foreach(index_file) do |line|
item = JSON.parse(line)
index_db[item["hash"]] = item
end if File.exist?(index_file)
STDOUT << "Found index: #{index_db.length}\n"
  # Scan directory recursively for files matching name_match
  name_match = path.nameMatch || "*.{md,markdown}"
  dir_glob = File.join(File.expand_path(path.dir), "**", name_match)
  files = Dir[dir_glob]
  STDOUT << "Scan dir: #{dir_glob}, Found: #{files.length}\n"
  # Get reader; a local variable, not a constant, since this runs once per path
  reader_class = get_reader(path.reader)
  if reader_class.nil?
    STDOUT << "Reader undefined: #{path.reader}\n"
    exit 9
  end
  # Build index
  STDOUT << "Building index @#{Time.now}\n["
  skipped = 0
  created = 0
  File.open(index_file, "w") do |index_newdb|
    files.each_with_index do |file, file_idx|
      chunks = reader_class.new(file).load.chunks
      chunks.each_with_index do |chunk, chunk_idx|
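        # Content-addressed cache: an unchanged chunk hashes to the same
        # value, so its previously computed embedding is carried over.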
        hash = Digest::SHA256.hexdigest(chunk)
        if index_db[hash] # found in old DB, reuse the entry
          index_newdb.puts(index_db[hash].to_json)
          skipped += 1
          next
        end
        created += 1
        vector = embedding(chunk)
        line = { path: file, hash: hash, chunk: chunk_idx, embedding: vector }
        index_newdb.puts(line.to_json)
      end
      if file_idx % 50 == 0 # periodically flush buffered writes
        index_newdb.flush
        STDOUT << file_idx # print a numeric progress marker
      else
        STDOUT << "."
      end
    end
  end
STDOUT << "]\nDone @#{Time.now}, Created: #{created}, Skipped: #{skipped}\n"
end