forked from github-linguist/linguist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheuristics.rb
171 lines (144 loc) · 4.19 KB
/
heuristics.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
require 'yaml'
module Linguist
# A collection of simple heuristics that can be used to better analyze languages.
#
# Each instance pairs a list of file extensions with an ordered list of rules.
# A rule is a Hash carrying a 'language' entry (a name or an Array of names)
# and a 'pattern' entry holding any object that responds to match?(data).
class Heuristics
  # Upper bound on how much of blob.data is handed to the rules
  # (applied as a 0...N slice in Heuristics.call).
  HEURISTICS_CONSIDER_BYTES = 50 * 1024

  # Internal: Array of defined heuristics, filled on first use by Heuristics.load.
  @heuristics = []

  # Public: Use heuristics to detect language of the blob.
  #
  # blob       - An object that quacks like a blob (symlink?, data and name are used).
  # candidates - Array of Language objects, forwarded to Heuristics#matches?.
  #
  # Examples
  #
  #   Heuristics.call(FileBlob.new("path/to/file"), [
  #     Language["Ruby"], Language["Python"]
  #   ])
  #
  # Returns an Array of languages, or empty if none matched or were inconclusive.
  def self.call(blob, candidates)
    return [] if blob.symlink?

    # Explicit receiver: this is the class-level loader, not Kernel#load.
    self.load
    data = blob.data[0...HEURISTICS_CONSIDER_BYTES]

    # Only the first heuristic registered for the file's extension is consulted;
    # if none of its rules match, the result is empty rather than trying the next.
    heuristic = @heuristics.find { |candidate| candidate.matches?(blob.name, candidates) }
    heuristic ? Array(heuristic.call(data)) : []
  rescue Regexp::TimeoutError
    # Return nothing if we have a bad regexp which leads to a timeout enforced
    # by Regexp.timeout in Ruby 3.2 or later.
    []
  end

  # Public: Get all heuristic definitions
  #
  # Returns an Array of heuristic objects.
  def self.all
    self.load
    @heuristics
  end

  # Internal: Load heuristics from 'heuristics.yml'.
  #
  # Does nothing once at least one heuristic has been registered. Every rule
  # Hash from the config is updated in place: its 'pattern' entry is replaced
  # by the matcher object built by parse_rule.
  #
  # Returns nothing of interest.
  def self.load
    return if @heuristics.any?

    data = load_config
    named_patterns = data['named_patterns'].map { |name, source| [name, to_regex(source)] }.to_h

    data['disambiguations'].each do |disambiguation|
      rules = disambiguation['rules']
      rules.each { |rule| rule['pattern'] = parse_rule(named_patterns, rule) }
      @heuristics << new(disambiguation['extensions'], rules)
    end
  end

  # Internal: Read and parse the 'heuristics.yml' file located next to this file.
  #
  # Returns the parsed YAML document.
  def self.load_config
    YAML.load_file(File.expand_path("../heuristics.yml", __FILE__))
  end

  # Internal: Build the matcher for a single rule.
  #
  # named_patterns - Hash of pattern name => Regexp.
  # rule           - Hash describing the rule. The first non-nil key among
  #                  'and', 'pattern', 'negative_pattern' and 'named_pattern'
  #                  (checked in that order) decides the kind of matcher.
  #
  # Returns an object responding to match?, or nil when 'named_pattern' refers
  # to a name that is missing from named_patterns.
  def self.parse_rule(named_patterns, rule)
    unless rule['and'].nil?
      return And.new(rule['and'].map { |sub_rule| parse_rule(named_patterns, sub_rule) })
    end
    return to_regex(rule['pattern']) unless rule['pattern'].nil?
    return NegativePattern.new(to_regex(rule['negative_pattern'])) unless rule['negative_pattern'].nil?
    return named_patterns[rule['named_pattern']] unless rule['named_pattern'].nil?

    # A rule without any condition acts as a catch-all.
    AlwaysMatch.new
  end

  # Internal: Converts a string or array of strings to regexp
  #
  # str: string or array of strings. If it is an array of strings,
  #      Regexp.union will be used.
  #
  # Returns a Regexp.
  def self.to_regex(str)
    return Regexp.new(str) unless str.kind_of?(Array)

    Regexp.union(str.map { |source| Regexp.new(source) })
  end

  # Internal
  #
  # exts  - Array of extension Strings, compared against lowercased filenames.
  # rules - Array of rule Hashes ('language' and 'pattern' entries are read).
  def initialize(exts, rules)
    @exts = exts
    @rules = rules
  end

  # Internal: Return the heuristic's target extensions
  def extensions
    @exts
  end

  # Internal: Return the heuristic's candidate languages
  #
  # Returns a flat Array of Language lookups without duplicates.
  def languages
    per_rule = @rules.map do |rule|
      [rule['language']].flatten(2).map { |name| Language[name] }
    end
    per_rule.flatten.uniq
  end

  # Internal: Check if this heuristic applies to the given filename.
  #
  # filename   - String name of the file; compared case-insensitively by
  #              lowercasing it before the suffix test.
  # candidates - Accepted for interface compatibility; not consulted.
  #
  # Returns true when the filename ends with one of the heuristic's extensions.
  def matches?(filename, candidates)
    lowered = filename.downcase
    @exts.any? { |ext| lowered.end_with?(ext) }
  end

  # Internal: Perform the heuristic
  #
  # data - The content to run the rule patterns against.
  #
  # Returns the Language (or Array of Languages) of the first rule whose
  # pattern matches, or nil when no rule matches.
  def call(data)
    matched = @rules.find { |rule| rule['pattern'].match?(data) }
    return if matched.nil?

    names = matched['language']
    names.is_a?(Array) ? names.map { |name| Language[name] } : Language[names]
  end
end
# Internal: Composite matcher that accepts an input only when every one of
# the wrapped matchers accepts it.
class And
  # pats - Array of objects responding to match?(input).
  def initialize(pats)
    @pats = pats
  end

  # input - The value handed to each wrapped matcher in turn.
  #
  # Returns true when no wrapped matcher rejects the input (an empty list
  # therefore matches), false as soon as one of them does.
  def match?(input)
    @pats.none? { |pat| !pat.match?(input) }
  end
end
# Internal: Matcher that accepts every input.
class AlwaysMatch
  # input - Ignored.
  #
  # Returns true unconditionally.
  def match?(input)
    true
  end
end
# Internal: Matcher that inverts another matcher, accepting exactly the
# inputs the wrapped pattern rejects.
class NegativePattern
  # pat - An object responding to match?(input), e.g. a Regexp.
  def initialize(pat)
    @pat = pat
  end

  # input - The value tested against the wrapped pattern.
  #
  # Returns true when the wrapped pattern does NOT match, false when it does.
  def match?(input)
    !@pat.match?(input)
  end
end
end