-
Notifications
You must be signed in to change notification settings - Fork 201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add regex stuff #529
Add regex stuff #529
Changes from 7 commits
4a413d0
f32bb95
80adac8
786dee6
2b0b086
6857c6e
99fd457
8ed0493
c55bca1
99d2218
1454962
3ac21b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ Copyright (c) 2023 Uber Technologies, Inc. | |
use crate::{ | ||
models::Validator, | ||
utilities::{ | ||
regex_utilities::get_all_matches_for_regex, | ||
tree_sitter_utilities::{get_all_matches_for_query, get_ts_query_parser, number_of_errors}, | ||
Instantiate, | ||
}, | ||
|
@@ -38,12 +39,20 @@ impl CGPattern { | |
pub(crate) fn pattern(&self) -> String { | ||
self.0.to_string() | ||
} | ||
|
||
pub(crate) fn extract_regex(&self) -> String { | ||
let mut _val = &self.pattern()[4..]; | ||
_val.to_string() | ||
} | ||
} | ||
|
||
impl Validator for CGPattern { | ||
fn validate(&self) -> Result<(), String> { | ||
if self.pattern().starts_with("rgx ") { | ||
panic!("Regex not supported") | ||
let mut _val = &self.pattern()[4..]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we have this logic here and in |
||
return Regex::new(_val) | ||
.map(|_| Ok(())) | ||
.unwrap_or(Err(format!("Cannot parse the regex - {_val}"))); | ||
} | ||
let mut parser = get_ts_query_parser(); | ||
parser | ||
|
@@ -109,7 +118,9 @@ impl CompiledCGPattern { | |
replace_node, | ||
replace_node_idx, | ||
), | ||
CompiledCGPattern::R(_) => panic!("Regex is not yet supported!!!"), | ||
CompiledCGPattern::R(regex) => { | ||
get_all_matches_for_regex(node, source_code, regex, recursive, replace_node) | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,6 +55,18 @@ pub(crate) struct Match { | |
gen_py_str_methods!(Match); | ||
|
||
impl Match { | ||
pub(crate) fn from_regex( | ||
mtch: ®ex::Match, matches: HashMap<String, String>, source_code: &str, | ||
) -> Self { | ||
Match { | ||
matched_string: mtch.as_str().to_string(), | ||
range: Range::from_regex_match(mtch, source_code), | ||
matches, | ||
associated_comma: None, | ||
associated_comments: Vec::new(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we populate associated comments later and does that work for regex matches? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not blocking, but still curious about this. |
||
} | ||
} | ||
|
||
pub(crate) fn new( | ||
matched_string: String, range: tree_sitter::Range, matches: HashMap<String, String>, | ||
) -> Self { | ||
|
@@ -231,7 +243,7 @@ impl Match { | |
serde_derive::Serialize, Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Deserialize, | ||
)] | ||
#[pyclass] | ||
struct Range { | ||
pub(crate) struct Range { | ||
#[pyo3(get)] | ||
start_byte: usize, | ||
#[pyo3(get)] | ||
|
@@ -260,6 +272,32 @@ impl From<tree_sitter::Range> for Range { | |
} | ||
gen_py_str_methods!(Range); | ||
|
||
impl Range { | ||
pub(crate) fn from_regex_match(mtch: ®ex::Match, source_code: &str) -> Self { | ||
Self { | ||
start_byte: mtch.start(), | ||
end_byte: mtch.end(), | ||
start_point: position_for_offset(source_code.as_bytes(), mtch.start()), | ||
end_point: position_for_offset(source_code.as_bytes(), mtch.end()), | ||
} | ||
} | ||
} | ||
|
||
// Finds the position (col and row number) for a given offset. | ||
// Copied from tree-sitter tests [https://github.com/tree-sitter/tree-sitter/blob/d0029a15273e526925a764033e9b7f18f96a7ce5/cli/src/parse.rs#L364] | ||
fn position_for_offset(input: &[u8], offset: usize) -> Point { | ||
let mut result = Point { row: 0, column: 0 }; | ||
for c in &input[0..offset] { | ||
if *c as char == '\n' { | ||
result.row += 1; | ||
result.column = 0; | ||
} else { | ||
result.column += 1; | ||
} | ||
} | ||
result | ||
} | ||
|
||
/// A range of positions in a multi-line text document, both in terms of bytes and of | ||
/// rows and columns. | ||
#[derive( | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,70 @@ | ||||||
/* | ||||||
Copyright (c) 2023 Uber Technologies, Inc. | ||||||
<p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file | ||||||
except in compliance with the License. You may obtain a copy of the License at | ||||||
<p>http://www.apache.org/licenses/LICENSE-2.0 | ||||||
<p>Unless required by applicable law or agreed to in writing, software distributed under the | ||||||
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either | ||||||
express or implied. See the License for the specific language governing permissions and | ||||||
limitations under the License. | ||||||
*/ | ||||||
|
||||||
use crate::models::matches::Match; | ||||||
use itertools::Itertools; | ||||||
use regex::Regex; | ||||||
use std::collections::HashMap; | ||||||
use tree_sitter::Node; | ||||||
|
||||||
/// Applies the query upon the given `node`, and gets all the matches | ||||||
/// # Arguments | ||||||
/// * `node` - the root node to apply the query upon | ||||||
/// * `source_code` - the corresponding source code string for the node. | ||||||
/// * `recursive` - if `true` it matches the query to `self` and `self`'s sub-ASTs, else it matches the `query` only to `self`. | ||||||
/// * `replace_node` - node to replace | ||||||
/// | ||||||
/// # Returns | ||||||
/// The range of the match in the source code and the corresponding mapping from tags to code snippets. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this correct? Return seems to just be a vector of matches. Where is this mapping encoded? |
||||||
pub(crate) fn get_all_matches_for_regex( | ||||||
node: &Node, source_code: String, regex: &Regex, recursive: bool, replace_node: Option<String>, | ||||||
) -> Vec<Match> { | ||||||
// let code_snippet = node.utf8_text(source_code.as_bytes()).unwrap(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Delete commented out line |
||||||
let all_captures = regex.captures_iter(&source_code).collect_vec(); | ||||||
let names = regex.capture_names().collect_vec(); | ||||||
let mut all_matches = vec![]; | ||||||
for captures in all_captures { | ||||||
// Check if the range of the self (node), and the range of outermost node captured by the query are equal. | ||||||
let range_matches_node = node.start_byte() == captures.get(0).unwrap().start() | ||||||
&& node.end_byte() == captures.get(0).unwrap().end(); | ||||||
let range_matches_inside_node = node.start_byte() <= captures.get(0).unwrap().start() | ||||||
&& node.end_byte() >= captures.get(0).unwrap().end(); | ||||||
if (recursive && range_matches_inside_node) || range_matches_node { | ||||||
let group_by_tag = if let Some(ref rn) = replace_node { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Group by tag is either the matched code string (plus location info) corresponding to the match group for the replace node if present or a the matched code string matching the full regex match if not, correct? Why is the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes! your understanding is correct. It represents the |
||||||
captures | ||||||
.name(rn) | ||||||
.unwrap_or_else(|| panic!("the tag {rn} provided in the replace node is not present")) | ||||||
} else { | ||||||
captures.get(0).unwrap() | ||||||
}; | ||||||
let matches = extract_captures(&captures, &names); | ||||||
all_matches.push(Match::from_regex(&group_by_tag, matches, &source_code)); | ||||||
} | ||||||
} | ||||||
all_matches | ||||||
} | ||||||
|
||||||
// Creates an hashmap from the capture group(name) to the corresponding code snippet. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
fn extract_captures( | ||||||
captures: ®ex::Captures<'_>, names: &Vec<Option<&str>>, | ||||||
) -> HashMap<String, String> { | ||||||
names | ||||||
.iter() | ||||||
.flatten() | ||||||
.flat_map(|x| { | ||||||
captures | ||||||
.name(x) | ||||||
.map(|v| (x.to_string(), v.as_str().to_string())) | ||||||
}) | ||||||
.collect() | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copyright (c) 2023 Uber Technologies, Inc. | ||
# | ||
# <p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file | ||
# except in compliance with the License. You may obtain a copy of the License at | ||
# <p>http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# <p>Unless required by applicable law or agreed to in writing, software distributed under the | ||
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either | ||
# express or implied. See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
[[edges]] | ||
scope = "File" | ||
from = "update_import" | ||
to = ["update_list_int"] | ||
|
||
[[edges]] | ||
scope = "Method" | ||
from = "update_list_int" | ||
to = ["update_add"] |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,59 @@ | ||||||
# Copyright (c) 2023 Uber Technologies, Inc. | ||||||
# | ||||||
# <p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file | ||||||
# except in compliance with the License. You may obtain a copy of the License at | ||||||
# <p>http://www.apache.org/licenses/LICENSE-2.0 | ||||||
# | ||||||
# <p>Unless required by applicable law or agreed to in writing, software distributed under the | ||||||
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either | ||||||
# express or implied. See the License for the specific language governing permissions and | ||||||
# limitations under the License. | ||||||
|
||||||
# Replace foo().bar().baz() with `true` inside methods not nnotated as @DoNotCleanup | ||||||
[[rules]] | ||||||
name = "replace_call" | ||||||
query = """rgx (?P<n1>foo\\(\\)\\.bar\\(\\)\\.baz\\(\\))""" | ||||||
replace_node = "n1" | ||||||
replace = "true" | ||||||
groups = ["replace_expression_with_boolean_literal"] | ||||||
[[rules.filters]] | ||||||
enclosing_node = """(method_declaration) @md""" | ||||||
not_contains = ["""rgx @DoNotCleanup"""] | ||||||
|
||||||
# Before: | ||||||
# abc().def().ghi() | ||||||
# abc().fed().ghi() | ||||||
[[rules]] | ||||||
name = "replace_call_def_fed" | ||||||
query = """rgx (?P<n>abc\\(\\)\\.(?P<m_def>def)\\(\\)\\.ghi\\(\\))""" | ||||||
replace_node = "m_def" | ||||||
replace = "fed" | ||||||
|
||||||
|
||||||
# The below three rules do a dummy type migration from List<Integer> to NewList | ||||||
|
||||||
# Updates the import statement from `java.util.List` to `com.uber.NEwList` | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
[[rules]] | ||||||
name = "update_import" | ||||||
query = """rgx (?P<n>java\\.util\\.List)""" | ||||||
replace_node = "n" | ||||||
replace = "com.uber.NewList" | ||||||
|
||||||
# Updates the type of local variables from `List<Integer>` to `com.uber.NewList` | ||||||
[[rules]] | ||||||
name = "update_list_int" | ||||||
query = """rgx (?P<var_decl>(?P<type>List<Integer>)\\s*(?P<name>\\w+)\\s*=.*;)""" | ||||||
replace_node = "type" | ||||||
replace = "NewList" | ||||||
is_seed_rule = false | ||||||
[[rules.filter]] | ||||||
enclosing_node = "(method_declaration) @cmd" | ||||||
|
||||||
# Updates the relevant callsite from `add` to `addToNewList` | ||||||
[[rules]] | ||||||
name = "update_add" | ||||||
query = """rgx (?P<call>@name\\.(?P<m_name>add)\\(\\w+\\))""" | ||||||
replace_node = "m_name" | ||||||
replace = "addToNewList" | ||||||
holes = ["name"] | ||||||
is_seed_rule = false | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Every Or is something like that not meant to be supported? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thats a brilliant idea ! We can absolutely do that! Done ✅ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package com.uber.piranha; | ||
|
||
import com.uber.NewList; | ||
|
||
class A { | ||
|
||
void foobar() { | ||
System.out.println("Hello World!"); | ||
System.out.println(true); | ||
} | ||
|
||
@DoNotCleanup | ||
void barfn() { | ||
boolean b = foo().bar().baz(); | ||
System.out.println(b); | ||
} | ||
|
||
void foofn() { | ||
int total = abc().fed().ghi(); | ||
} | ||
|
||
void someTypeChange() { | ||
// Will get updated | ||
NewList a = getList(); | ||
Integer item = getItem(); | ||
a.addToNewList(item); | ||
|
||
// Will not get updated | ||
List<String> b = getListStr(); | ||
Integer item = getItemStr(); | ||
b.add(item); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,37 @@ | ||||||
package com.uber.piranha; | ||||||
|
||||||
import java.util.List; | ||||||
|
||||||
class A { | ||||||
|
||||||
void foobar() { | ||||||
boolean b = foo().bar().baz(); | ||||||
if (b) { | ||||||
System.out.println("Hello World!"); | ||||||
} | ||||||
System.out.println(b); | ||||||
} | ||||||
|
||||||
@DoNotCleanup | ||||||
void barfn() { | ||||||
boolean b = foo().bar().baz(); | ||||||
System.out.println(b); | ||||||
} | ||||||
|
||||||
void foofn() { | ||||||
int total = abc().def().ghi(); | ||||||
} | ||||||
|
||||||
void someTypeChange() { | ||||||
// Will get updated | ||||||
List<Integer> a = getList(); | ||||||
Integer item = getItem(); | ||||||
a.add(item); | ||||||
|
||||||
// Will not get updated | ||||||
List<String> b = getListStr(); | ||||||
Integer item = getItemStr(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Or something like that. I know this code doesn't get compiled, but it still should make sense types-wise :) |
||||||
b.add(item); | ||||||
} | ||||||
|
||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.