-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrepfile.cr
319 lines (291 loc) · 12.8 KB
/
grepfile.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
require "admiral"
require "compress/gzip"
class GrepFile < Admiral::Command
# ---- Command-line interface (Admiral macros) --------------------------------
# Two positional arguments name the inputs; every tunable is a flag.
# Integer flags are used as booleans throughout: 0 means off, >= 1 means on.

# File whose lines are filtered and printed.
define_argument target,
  description: "target file, support flat or .gz file or stdin(-)",
  required: true
# File that supplies the keys to look for.
define_argument query,
  description: "query file, support flat or .gz file or stdin(-)",
  required: true

# 1-based column positions used to build the comparison key of each file.
define_flag column_target : String,
  default: "1",
  description: "choose which column to compare for target file, allow multil columns as keyword (example 1,3,5)"
define_flag column_query : String,
  default: "1",
  description: "choose which column to compare for query file, allow multil columns as keyword (example 1,3,5)"

# When >= 1 the target line numbered --header-target is echoed to stdout.
define_flag print_header : Int32,
  default: 0,
  description: "1 mean output header, 0 mean not output header"

# Column selection by header name; when given, it replaces the numeric
# --column-target / --column-query selection for that file.
define_flag column_name_target : String,
  default: "",
  description: "use with --header-target, and not allow with --column_target together! choose which column to compare for target file, allow multil columns as keyword (example key1,key2,key3)"
# The help text used to say "target file" here (copy-paste from the flag
# above); this flag selects columns of the query file.
define_flag column_name_query : String,
  default: "",
  description: "use with --header-query, and not allow with --column_query together! choose which column to compare for query file, allow multil columns as keyword (example key1,key2,key3)"

# 1-based line number of the header line in each file.
define_flag header_query : Int32,
  default: 1,
  description: "set which one line is the header of query file"
define_flag header_target : Int32,
  default: 1,
  description: "set which one line is the header of target file"

define_flag sort_output_by_query : Int32,
  default: 0_i32,
  description: "sort ouput by query column order"
# The pattern is matched against the whole input line (see read_query_file and
# read_target_file). The flag name keeps its historical spelling because it is
# part of the public CLI.
define_flag ignore_line_mathed_by : String,
  default: "^[#@]",
  description: "if content of column start with # or @, will skip this line, support regex syntax"
define_flag ignore_case : Int32,
  default: 0_i32,
  description: "if set to 1 mean will ignore case for query and target match, default 0"
# Regex removed from every selected column value before keys are compared.
define_flag delete_chars_from_column : String,
  default: "^>",
  description: "delete > from content of column, support regex syntax"
define_flag invert_match : Int32,
  default: 0_i32,
  description: "if >=1, mean invert the sense of matching, to select non-matching lines"
# Separators are interpolated into a regex when lines are split.
define_flag sep_query : String,
  default: "\t",
  description: "query separator, '\\t' or '\\s' or other string"
define_flag exact_match : Int32,
  default: 1_i32,
  description: "if >=1, mean equal totally else mean macth"
define_flag sep_target : String,
  default: "\t",
  description: "target separator, '\\t' or '\\s' or other string"

define_help description: "A replace for $ grep -f (which cost too many memory and time) in Linux"
define_version "1.0.4"

# NOTE(review): Time.local is a method call, so this value is taken when the
# program runs rather than when it is built, despite the name — confirm the
# intent. Within this file it is only referenced from commented-out code.
COMPILE_TIME = Time.local
# Command body: loads the query keys, then streams the target and prints the
# lines selected by those keys.
#
# The two input paths are read straight from ARGV[0] (target) and ARGV[1]
# (query), so they have to be the first two command-line words. Either one may
# be "stdin" or "-", but not both. Paths ending in ".gz" are gunzipped.
#
# Without --sort-output-by-query, matching lines are printed by
# read_target_file as they are encountered. With it, matching lines are
# buffered per query key and printed afterwards in query order; every target
# line that shares a key is kept (in target order) instead of only the last one.
#
# Raises when both inputs are stdin; errors raised by the helper methods
# (unknown column name, column out of range, ...) propagate unchanged.
def run
  if ARGV.size < 2
    puts "Contact: https://github.com/orangeSi/grepfile/issues"
    GrepFile.run "--help"
    # Nothing below can work without both paths, whatever the help run did.
    return
  end

  target = ARGV[0]
  query = ARGV[1]
  target_is_stdin = (target == "stdin" || target == "-")
  query_is_stdin = (query == "stdin" || query == "-")
  if target_is_stdin && query_is_stdin
    raise "error: target and query should not both be stdin, only one or zero is stdin!"
  end

  ignore_line_mathed_by = flags.ignore_line_mathed_by

  # A column name takes precedence over the numeric selection; the numeric
  # columns are then resolved from the header line while reading.
  column_target = flags.column_target.strip(",")
  column_query = flags.column_query.strip(",")
  column_target = "" if flags.column_name_target != "" && flags.column_target != ""
  column_query = "" if flags.column_name_query != "" && flags.column_query != ""

  # ---- read query file: collect the set of keys ----
  query_ids = {} of String => String
  query_label = query_is_stdin ? "stdin" : query # only used in error messages
  line_index = 0
  each_input_line(query) do |line|
    line_index += 1
    # The header line only resolves column names; it never contributes a key.
    if flags.column_name_query != "" && line_index == flags.header_query
      column_query = get_column_number_by_name(line: line, colname: flags.column_name_query, sep: flags.sep_query)
      next
    end
    query_ids = read_query_file(line, column_query, query_ids, ignore_line_mathed_by: ignore_line_mathed_by, sep_query: flags.sep_query, query: query_label, delete_chars_from_column: flags.delete_chars_from_column, ignore_case: flags.ignore_case)
  end

  # ---- read target file: print or buffer the selected lines ----
  sort_output_by_query_flag = (flags.sort_output_by_query >= 1)
  sorted_output = {} of String => Array(String)
  target_label = target_is_stdin ? "target" : target # only used in error messages
  line_index = 0
  each_input_line(target) do |line|
    line_index += 1
    if flags.print_header >= 1 && flags.header_target == line_index
      puts line
    end
    if flags.column_name_target != "" && line_index == flags.header_target
      column_target = get_column_number_by_name(line: line, colname: flags.column_name_target, sep: flags.sep_target)
      next
    end
    output_flag = read_target_file(line, query_ids, ignore_line_mathed_by: ignore_line_mathed_by, sep_target: flags.sep_target, target: target_label, column_target: column_target, delete_chars_from_column: flags.delete_chars_from_column, invert_match: flags.invert_match, exact_match: flags.exact_match, ignore_case: flags.ignore_case, sort_output_by_query_flag: sort_output_by_query_flag)
    # Only a non-empty String is a key to buffer under; anything else means
    # the line was skipped or has already been printed.
    if output_flag.is_a?(String) && !output_flag.empty?
      (sorted_output[output_flag] ||= [] of String) << line
    end
  end

  if sort_output_by_query_flag
    query_ids.each_key do |key|
      if buffered = sorted_output[key]?
        buffered.each { |matched_line| puts matched_line }
      end
    end
  end
end

# Yields every line of `path`: standard input for "stdin" or "-", a gunzipped
# stream for names ending in ".gz", and a plain file otherwise.
private def each_input_line(path : String)
  if path == "stdin" || path == "-"
    STDIN.each_line { |line| yield line }
  elsif path.match(/.*\.gz$/)
    Compress::Gzip::Reader.open(path) do |gzip|
      gzip.each_line { |line| yield line }
    end
  else
    File.each_line(path) { |line| yield line }
  end
end
# Translates a comma-separated list of column names into the matching
# comma-separated list of 1-based column positions, using `line` as the header.
#
# line    - header line of the file
# colname - one or more column names separated by "," (leading/trailing commas
#           are ignored)
# sep     - column separator; it is interpolated into a regex
#
# Returns the positions in the order the names were given, e.g. "3,1".
# When a name occurs more than once in the header, its first occurrence wins.
# Raises if any requested name is absent from the header. Resolved positions
# are counted directly: splitting an empty result string yields one empty
# element in Crystal, which let a single unknown name slip through unnoticed.
def get_column_number_by_name(line : String, colname : String, sep : String)
  header_cells = line.split(/#{sep}/)
  wanted_names = colname.strip(",").split(",")

  positions = [] of Int32
  wanted_names.each do |name|
    if found_at = header_cells.index(name)
      positions << found_at + 1
    end
  end

  if positions.size != wanted_names.size
    raise "error: column name #{colname} are not in line:#{line}\n"
  end
  positions.join(",")
end
# Decides what to do with one target line.
#
# The comparison key is built from the columns listed in `column_target`
# (1-based, comma separated): each value has `delete_chars_from_column`
# (a regex) removed, the values are joined with "_", and the result is
# upcased when `ignore_case` > 0.
#
# A key "hits" when, with `exact_match` >= 1, it is present in `query_ids`;
# otherwise when some query key, used as a regex, matches it (the first such
# query key in hash order is taken).
#
# * invert_match == 0: a hit is printed immediately, unless
#   `sort_output_by_query_flag` is set, in which case nothing is printed and
#   the query key it belongs to is returned so the caller can buffer the line.
# * invert_match != 0: lines without a hit are printed immediately.
#
# Returns the query key to buffer under, or "" when there is nothing to buffer
# (line skipped, no hit, or already printed). Skipped lines used to return
# `false`, which callers comparing against "" mistook for a key.
#
# Lines matching `ignore_line_mathed_by` (regex, "" disables it) and blank
# lines are skipped.
#
# Raises when a column position is not a positive integer or exceeds the
# number of columns in the line. Positions below 1 used to read silently from
# the end of the row.
def read_target_file(line : String, query_ids : Hash(String, String), ignore_line_mathed_by : String = "^#", sep_target : String = "\t", target : String = "target", column_target : String = "1", delete_chars_from_column : String = "^>", invert_match : Int32 = 0, exact_match : Int32 = 0, ignore_case : Int32 = 0, sort_output_by_query_flag : Bool = false)
  return "" if ignore_line_mathed_by != "" && line.match(/#{ignore_line_mathed_by}/)
  return "" if line.match(/^\s*$/)

  arr = line.split(/#{sep_target}/)
  id = ""
  column_target.split(",").each do |raw_col|
    tcol = raw_col.to_i? || 0 # non-numeric text is rejected together with 0 and negatives
    raise "error: invalid column '#{raw_col}' in --column_target #{column_target}, columns must be integers >= 1\n" if tcol < 1
    raise "error: #{target} only have #{arr.size} column in line #{line}, but tcol=#{tcol} in --column_target #{column_target}, try to change --sep_target for line: #{arr}\n" if tcol > arr.size
    tcol_id = arr[tcol - 1]
    tcol_id = tcol_id.gsub(/#{delete_chars_from_column}/, "") if delete_chars_from_column != ""
    # Same joining rule as the query side, so keys from both files agree.
    id = id != "" ? "#{id}_#{tcol_id}" : tcol_id
  end
  id = id.upcase if ignore_case > 0

  # Find the query key this line hits, if any.
  matched_key = nil
  if exact_match >= 1
    matched_key = id if query_ids.has_key?(id)
  else
    query_ids.each_key do |k|
      if id =~ /#{k}/
        matched_key = k
        break
      end
    end
  end

  hit = matched_key
  if invert_match == 0
    if hit
      return hit if sort_output_by_query_flag
      puts "#{line}"
    end
  elsif hit.nil?
    puts "#{line}"
  end
  ""
end
# Adds the key found on one query line to `query_ids` and returns the hash.
#
# The key is built from the columns listed in `column_query` (1-based, comma
# separated): each value has `delete_chars_from_column` (a regex, "" disables
# it) removed, the values are joined with "_", and the result is upcased when
# `ignore_case` > 0. Keys are stored with an empty string as value; a key that
# is already present is left untouched, so insertion order is that of first
# appearance.
#
# Lines matching `ignore_line_mathed_by` (regex, "" disables it) and blank
# lines leave the hash unchanged. `query` is only used in error messages.
#
# Raises when a column position is not a positive integer or exceeds the
# number of columns in the line. Positions below 1 used to read silently from
# the end of the row.
def read_query_file(line : String, column_query : String, query_ids : Hash(String, String), ignore_line_mathed_by : String = "", sep_query : String = "\t", query : String = "", delete_chars_from_column : String = "", ignore_case : Int32 = 0)
  return query_ids if ignore_line_mathed_by != "" && line.match(/#{ignore_line_mathed_by}/)
  return query_ids if line.match(/^\s*$/)

  arr = line.split(/#{sep_query}/)
  id = ""
  column_query.split(",").each do |raw_col|
    tcol = raw_col.to_i? || 0 # non-numeric text is rejected together with 0 and negatives
    raise "error: invalid column '#{raw_col}' in --column_query #{column_query}, columns must be integers >= 1\n" if tcol < 1
    raise "error: query #{query} only have #{arr.size} columns, but --column_query #{column_query}, try to change --sep_query for line: #{arr}\n" if tcol > arr.size
    tcol_id = arr[tcol - 1]
    tcol_id = tcol_id.gsub(/#{delete_chars_from_column}/, "") if delete_chars_from_column != ""
    # Same joining rule as the target side, so keys from both files agree.
    id = id != "" ? "#{id}_#{tcol_id}" : tcol_id
  end
  id = id.upcase if ignore_case > 0

  query_ids[id] = "" unless query_ids.has_key?(id)
  query_ids
end
end
# Program entry point: run the command defined by the GrepFile class above.
GrepFile.run