# vulnerability_detection.py

# Copyright (C) 2021 Aurore Fass
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
    Looking for suspicious data flows: is there a data flow between a security- or privacy-critical
    API and an attacker (i.e., web application or another extension)?
"""

import os
import logging
import timeit
import json
import re

import pdg_js.node as _node
import pdg_js.utility_df as utility_df

import check_permissions
from get_pdg import get_node_computed_value_e
from extension_communication import build_extension_pdg
import danger_analysis
import wa_communication
import utility


PRINT_DEBUG = utility.PRINT_DEBUG  # Debug switch, taken over from the utility module

# Absolute path of the directory containing this file; the API lists are in suspicious_apis/
SRC_PATH = os.path.abspath(os.path.dirname(__file__))
DOUBLEX_APIS_PATH = os.path.join(SRC_PATH, 'suspicious_apis', 'doublex_apis.json')  # DoubleX APIs
EMPOWEB_APIS_PATH = os.path.join(SRC_PATH, 'suspicious_apis', 'empoweb_apis.json')  # EmPoWeb APIs


"""
In the following, if not stated otherwise:
    - node: Node
        Current node.
    - whoami: str
        Where I am: 'cs' or 'bp'.
    - res_dict: dict
        To store the vulnerability analysis results.
    - chrome: bool
        True if we are handling a chrome-based extension, False otherwise.
    - direct_danger_dict: dict
        Storing the analysis of dangerous APIs that can directly do bad things with attacker-
        controllable data.
    - danger_dict: dict
        Storing the analysis of dangerous APIs that 1) do bad things with attacker-controllable data
        and 2) need to send some results back to an attacker.
    - exfiltration_dict: dict
        Storing the analysis of dangerous APIs that directly exfiltrate data to an attacker.
"""


def check_prov_danger(danger_node, to_check_node):
    """ Looks for to_check_node among the origins of danger_node.

    The search order is: danger_node itself, then (for Value nodes) its provenance parents, then,
    recursively, each of its children.
    Returns (True, str-value-of-the-matching-node) on the first match, (False, None) otherwise. """

    # Direct check first; provenance parents are only considered if it failed
    matched, origin_value = analyze_danger(danger_node, to_check_node)
    if not matched and isinstance(danger_node, _node.Value):
        for ancestor in danger_node.provenance_parents:
            matched, origin_value = analyze_danger(ancestor, to_check_node)
            if matched:
                break
    if matched:
        return True, origin_value

    # We recurse on the children only: the provenance parents already store the origin
    for sub_node in danger_node.children:
        outcome = check_prov_danger(sub_node, to_check_node)
        if outcome[0]:
            return True, outcome[1]

    return False, None


def analyze_danger(danger_node, to_check_node):
    """ Node-level comparison between danger_node and to_check_node.

    Returns (True, str of the computed value of danger_node) if to_check_node is a Node and both
    nodes compare equal, (False, None) otherwise. """

    if not isinstance(to_check_node, _node.Node):
        return False, None
    if not danger_node == to_check_node:  # Comparison at the node level
        return False, None

    return True, str(get_node_computed_value_e(danger_node))


def look_for_vulnerabilities(node, whoami, sinks, dangers):
    """ Traverses the PDG below node and stores in dangers every call to one of the sinks.

    Only CallExpression / TaggedTemplateExpression nodes whose first child is the callee / tag are
    inspected; their computed value is also stored on the node with set_value. The traversal
    then continues in every child, flagged or not. """

    for current in node.children:
        is_call = current.name in ('CallExpression', 'TaggedTemplateExpression')
        if is_call and len(current.children) > 0 \
                and current.children[0].body in ('callee', 'tag'):
            callee_value = get_node_computed_value_e(current.children[0])
            full_value = get_node_computed_value_e(current)
            current.set_value(full_value)

            if isinstance(callee_value, str):  # Standard case: the callee has a str value
                flagged, sink = danger_analysis.check_dangerous_sinks(current, callee_value,
                                                                      sinks)
            elif isinstance(full_value, str):  # Special case for asynchronous XHR
                # {'XMLHttpRequest()': {}, 'onreadystatechange': <node.FunctionExpr}.open(...)
                flagged, sink = danger_analysis.check_async_xhr(current, full_value, sinks)
            else:  # Nothing to check if none of them is a str
                flagged, sink = False, None

            if flagged:  # Dangerous sink used
                danger_analysis.add_danger(where=dangers, api_name=sink, api_node=current,
                                           api_value=full_value, params=current.children[1:])

        look_for_vulnerabilities(current, whoami=whoami, sinks=sinks, dangers=dangers)


def fill_vulnerability_dict(my_dict, wa, wa_node, where):
    """ Stores in my_dict the description of one message exchanged with the web app.

    Keys written: 'wa' (API used to communicate with the web app), 'line' and 'filename'
    (location of wa_node, i.e., where this API was detected, CS vs. BP) and 'where' (context,
    value of the node leading to the vulnerability). """

    my_dict.update({
        'wa': wa,
        'line': wa_node.get_line(),
        'filename': wa_node.get_file(),
        'where': where,
    })


def check_data_exfiltration(danger_id_dict, with_wa, sensitive_api):
    """ Records in danger_id_dict every message sent to the web app that comes from sensitive_api.

    Each match is stored under the key 'sent_to_wa_<n>' (n starting at 1). """

    # When we get here, a sensitive API flew into a dangerous sink, and we have access to both.
    # What is left to determine is whether the sink's data is transmitted to the web app; to this
    # end, we check whether the sensitive API reaches one of the messages sent to the web app.

    hits = 0
    for outgoing in with_wa.sent_list:
        outgoing_value = get_node_computed_value_e(outgoing)
        leaked, origin = check_prov_danger(outgoing, sensitive_api)
        if not leaked:
            continue
        hits += 1
        entry = dict()
        danger_id_dict['sent_to_wa' + '_' + str(hits)] = entry
        fill_vulnerability_dict(my_dict=entry, wa=outgoing_value, wa_node=outgoing,
                                where=origin)


def search_callback_params(node):
    """ Depth-first search of a callback's parameters below node, specific to BP_SINKS_2_WA_CB.

    As soon as a direct child is a FunctionExpression, all its children but the last one (the
    function's body) are returned. Otherwise the search continues in the child and stops on
    the first non-empty result. Returns [] if nothing was found. """

    for sub_node in node.children:
        if isinstance(sub_node, _node.FunctionExpression):
            return sub_node.children[:-1]  # Parameters only, the body is left out
        nested_params = search_callback_params(sub_node)
        if nested_params:
            return nested_params
    return []


def check_pure_exfiltration(danger_id_dict, with_wa, danger):
    """ Checks if the dangerous sink callback is sent to the web app.

    For each message sent to the web app (with_wa.sent_list) that is a Value node, every
    parameter of the callback found below danger.api_node which is (by identity) one of the
    message's provenance parents is recorded in danger_id_dict under 'sent_to_wa_<n>'
    (n starting at 1). """

    # If we are here, it means that we have a dangerous sink from BP_SINKS_2_WA_CB. We would like to
    # know if the callback data of this dangerous sink is being transmitted to the web app.

    # The callback's parameters only depend on the sink: searched once, not once per message
    callback_params = search_callback_params(danger.api_node)

    to_wa_id = 0
    for sent_to_wa in with_wa.sent_list:  # Messages sent to the WA
        sent_to_wa_value = get_node_computed_value_e(sent_to_wa)  # Message sent value
        if not isinstance(sent_to_wa, _node.Value):
            continue  # No provenance information on this node, hence nothing to compare
        for param in callback_params:
            # Vulnerable if the very same node (identity, not equality) is a provenance parent
            if any(param is prov for prov in sent_to_wa.provenance_parents):
                to_wa_id += 1
                sent = danger_id_dict['sent_to_wa' + '_' + str(to_wa_id)] = dict()
                fill_vulnerability_dict(my_dict=sent, wa=sent_to_wa_value, wa_node=sent_to_wa,
                                        where=get_node_computed_value_e(param))


def analyze_all_dangers(dangers_list, dangers_dict, with_wa, what):
    """ Checks the provenance, and possibly the destination, of each dangerous API found.

    One entry 'danger<n>' (n starting at 1) is added to dangers_dict per element of dangers_list,
    with the sink name, its value, its parameters, its location, and a 'dataflow' flag set to
    True when a suspicious data flow was found.

    what selects the kind of check:
        - 'd': direct dangers, flagged when a relevant parameter comes from the web app;
        - 'i': indirect dangers, flagged when in addition the data received is sent back;
        - 'e': pure exfiltration, flagged when the callback data is sent to the web app. """

    for rank, danger in enumerate(dangers_list, start=1):
        logging.debug('Analyzing the dangerous API %s', danger.api_name)
        report = dict()
        dangers_dict['danger' + str(rank)] = report
        report['danger'] = danger.api_name  # Dangerous sink
        report['value'] = danger.api_value  # Corresponding value
        if isinstance(danger.api_params, list):
            for position, api_param in enumerate(danger.api_params, start=1):
                report['sink-param' + str(position)] = get_node_computed_value_e(api_param)
        report['line'] = danger.api_node.get_line()  # Corresponding line
        report['filename'] = danger.api_node.get_file()  # Corresponding file (CS vs. BP)
        report['dataflow'] = False  # Whether vulnerable or not

        if what == 'e':  # Exfiltration APIs
            check_pure_exfiltration(danger_id_dict=report, with_wa=with_wa, danger=danger)
            if any('to_wa' in key for key in report):
                report['dataflow'] = True  # Vulnerable
            continue

        # Direct + indirect dangers: only the "interesting" parameter(s) of the API are analyzed
        relevant_params = danger_analysis.get_relevant_param(node=danger.api_node,
                                                             api=danger.api_name)
        # As the parameters were filtered, the index below is not the original position of the
        # parameter in the sink; it starts with 0, which may avoid confusion
        for param_rank, danger_param in enumerate(relevant_params):
            per_param = dict()
            report['param_id' + str(param_rank)] = per_param
            matches = 0
            for incoming in with_wa.received_list:
                incoming_value = get_node_computed_value_e(incoming)
                vulnerable, where = check_prov_danger(danger_param, incoming)
                if not vulnerable:
                    continue
                matches += 1
                received = dict()
                per_param['received_from_wa' + '_' + str(matches)] = received
                fill_vulnerability_dict(my_dict=received, wa=incoming_value, wa_node=incoming,
                                        where=where)
                if what != 'i':
                    report['dataflow'] = True  # Vulnerable
                    continue
                # The sink should also leak data to the web app. We check if the data sent back
                # depends on the data received (on which the danger depends); we cannot check
                # the dependency on the danger's returned value, as we do not have access to it
                check_data_exfiltration(danger_id_dict=received, with_wa=with_wa,
                                        sensitive_api=incoming)
                if any('to_wa' in key for key in received):
                    report['dataflow'] = True  # Vulnerable


def analyze_vulnerabilities(whoami, res_dict, dangers, with_wa, benchmarks):
    """ Checks if the dangerous sinks collected for whoami make the extension vulnerable.

    The results are stored in res_dict[whoami], under 'direct_dangers', 'indirect_dangers' and
    'exfiltration_dangers'; the time spent is stored in benchmarks. """

    started_at = timeit.default_timer()

    # The three result dictionaries are created upfront, before any analysis is performed
    part_results = res_dict[whoami] = dict()
    for result_key in ('direct_dangers', 'indirect_dangers', 'exfiltration_dangers'):
        part_results[result_key] = dict()

    passes = (('direct_dangers', 'direct', 'd'),
              ('indirect_dangers', 'indirect', 'i'),
              ('exfiltration_dangers', 'exfiltration', 'e'))
    for result_key, danger_kind, what in passes:
        analyze_all_dangers(dangers_list=getattr(dangers, danger_kind),
                            dangers_dict=part_results[result_key], with_wa=with_wa, what=what)

    benchmarks[whoami + ': got vulnerabilities'] = timeit.default_timer() - started_at
    message = 'Successfully analyzed and collected the vulnerabilities in the ' + whoami + ' in'
    utility_df.micro_benchmark(message, timeit.default_timer() - started_at)


def analyze_extension_part(pdg, whoami, with_wa, extension_part, benchmarks, chrome, messages_dict,
                           war=False):
    """ Collects from the PDG of the CS/BP what the vulnerability analysis will need:
        - the dangerous sinks encountered (stored in extension_part.dangers);
        - the elements coming from the web app (with_wa.received_list);
        - the elements sent back to the web app (with_wa.sent_list).
    If war is True and whoami is 'bp', the web app communication is also collected a second
    time on the same PDG with whoami='cs'. The time spent is stored in benchmarks. """

    started_at = timeit.default_timer()
    collected = extension_part.dangers
    wanted = extension_part.sinks  # Sinks that should be looked for

    # direct = directly executable sinks
    # indirect = sinks whose output after execution should be sent back to the web app
    # exfiltration = sinks whose output should be sent back to the web app (may be None: skipped)
    for category in ('direct', 'indirect', 'exfiltration'):
        category_sinks = getattr(wanted, category)
        if category == 'exfiltration' and category_sinks is None:
            continue
        look_for_vulnerabilities(pdg, whoami=whoami, sinks=category_sinks,
                                 dangers=getattr(collected, category))

    # Fills with_wa.received_list and with_wa.sent_list
    perspectives = [whoami]
    if war and whoami == 'bp':  # WAR = BP + with CS - WA communication
        perspectives.append('cs')
    for perspective in perspectives:
        wa_communication.web_app_communication(pdg, whoami=perspective, with_wa=with_wa,
                                               chrome=chrome, messages_dict=messages_dict)

    benchmarks[whoami + ': dangers & from WA'] = timeit.default_timer() - started_at
    message = 'Successfully collected the dangers and elements from the WA in the ' \
              + whoami + ' in'
    utility_df.micro_benchmark(message, timeit.default_timer() - started_at)


def analyze_extension(cs_path, bp_path, json_analysis=None, pdg=False, chrome=True, war=False,
                      json_messages=None, json_apis='permissions', manifest_path=None):
    """
    Complete analysis of an extension: builds and links the PDGs of the CS and BP, collects the
    dangerous sinks and the messages exchanged with the web app, looks for suspicious data flows,
    and finally prints the results (if PRINT_DEBUG) or stores them in a JSON file.

    :param cs_path: str, path of the CS file (of its PDG if pdg is True);
    :param bp_path: str, path of the BP file (of its PDG if pdg is True);
    :param json_analysis: str/None, path of the file the analysis results are stored in;
        If None: analysis.json, in the directory of <cs_path>.
    :param pdg: bool, True if cs_path/bp_path are paths of PDGs;
    :param chrome: bool, True for a chrome-based extension, False otherwise;
    :param war: bool, True if the BP is a WAR, False for a background / UIpage;
    :param json_messages: str/None, path of the file to store the messages in;
        If None: messages.json, in the directory of <cs_path>. Only Aurore version.
    :param json_apis: str, selects the APIs to consider (cf suspicious_apis/README):
        - 'permissions' (default): the DoubleX selected APIs for which the extension has the
            corresponding permissions;
        - 'all': the DoubleX selected APIs, whatever the permissions of the extension;
        - 'empoweb': the APIs from the EmPoWeb paper, to use ONLY to run the analysis on the
            EmPoWeb ground-truth dataset;
        - path: the APIs listed in the corresponding json file, which should respect our
            template, cf suspicious_apis/README.
    :param manifest_path: str/None, path of the manifest file.
        If None: manifest.json, in the directory of <cs_path>.
    :return: None. If utility.TEST is set, returns right after the collection of the dangers
        and web app messages, i.e., without looking for data flows nor outputting results.
    """

    utility_df.limit_memory(20 * 10 ** 9)  # Limiting the memory usage to 20GB

    extension_path = os.path.dirname(cs_path)
    benchmarks = dict()
    res_dict = {'extension': extension_path, 'benchmarks': benchmarks}
    messages_dict = dict()

    if manifest_path is None:
        manifest_path = os.path.join(extension_path, 'manifest.json')

    pdg_cs, pdg_bp = build_extension_pdg(cs_path=cs_path, bp_path=bp_path, benchmarks=benchmarks,
                                         pdg=pdg, chrome=chrome, messages_dict=messages_dict)
    logging.info('Finished to link CS with BP using the message passing APIs')

    try:
        with utility_df.Timeout(600):  # Tries to analyze an extension within 10 minutes
            # APIs to be considered
            sensitive_apis = load_sensitive_apis(json_apis, extension_path, manifest_path,
                                                 benchmarks=benchmarks)
            if sensitive_apis is None:  # Nothing to analyze, storing what we have so far
                store_analysis_results(extension_path, json_analysis, json_messages,
                                       res_dict, messages_dict)
                return

            extension = danger_analysis.Extension(apis=sensitive_apis)  # Dangerous sinks
            cs_part = extension.cs
            bp_part = extension.bp
            with_wa = wa_communication.WaCommunication()  # Elts coming from/to WA, initialization

            # Filling with_wa and the different dangers, first for the CS, then for the BP
            utility.print_info('In the CS:')
            analyze_extension_part(pdg_cs, whoami='cs', with_wa=with_wa, extension_part=cs_part,
                                   benchmarks=benchmarks, chrome=chrome,
                                   messages_dict=messages_dict)
            utility.print_info('---\nIn the BP:')
            analyze_extension_part(pdg_bp, whoami='bp', with_wa=with_wa, extension_part=bp_part,
                                   benchmarks=benchmarks, chrome=chrome,
                                   messages_dict=messages_dict, war=war)

            if utility.TEST:  # For the automated checks
                return

            utility.print_info('---\nVulnerability detection:')
            for whoami, part in (('cs', cs_part), ('bp', bp_part)):
                analyze_vulnerabilities(whoami, res_dict=res_dict, with_wa=with_wa,
                                        dangers=part.dangers, benchmarks=benchmarks)

    except utility_df.Timeout.Timeout:
        logging.exception('Analyzing the extension timed out for %s %s', cs_path, bp_path)
        # The list of crashes may not exist yet
        benchmarks.setdefault('crashes', []).append('extension-analysis-timeout')

    if not PRINT_DEBUG:
        store_analysis_results(extension_path, json_analysis, json_messages,
                               res_dict, messages_dict)
        return

    print(json.dumps(res_dict, indent=4, sort_keys=False, default=default, skipkeys=True))
    #print(json.dumps(messages_dict, indent=4, sort_keys=False, default=default, skipkeys=True))


def load_sensitive_apis(sensitive_apis_path, extension_path, manifest_path, benchmarks):
    """ Loads the sensitive APIs to consider from the sensitive_apis_path JSON file.

    :param sensitive_apis_path: str/None, what to load:
        - 'permissions': the file generated by check_permissions.generate_json_apis;
        - 'all': the DoubleX APIs file (DOUBLEX_APIS_PATH);
        - 'empoweb': the EmPoWeb APIs file (EMPOWEB_APIS_PATH), ONLY for ground truth analysis;
        - otherwise: path of the JSON file to load.
    :param extension_path: str, path of the extension, for the permission check and the logs;
    :param manifest_path: str/None, path of the manifest, forwarded to the permission check;
    :param benchmarks: dict, on failure the reason is appended to benchmarks['crashes'] (the
        list is created if it does not exist yet).
    :return: the deserialized JSON content, or None if no API file could be loaded.
    """

    def record_crash(reason):
        # 'crashes' is not guaranteed to exist when we get here: creating it on demand
        benchmarks.setdefault('crashes', []).append(reason)

    if sensitive_apis_path == 'permissions':  # Default case: DoubleX APIs, if permissions
        sensitive_apis_path = check_permissions.generate_json_apis(extension_path, manifest_path)
        if sensitive_apis_path is None or not os.path.isfile(sensitive_apis_path):
            logging.critical('No sensitive APIs can be considered. '
                             'The extension %s cannot be analyzed', extension_path)
            record_crash('no-api-file')
            return None

    if sensitive_apis_path == 'all':  # All DoubleX APIs
        sensitive_apis_path = DOUBLEX_APIS_PATH

    elif sensitive_apis_path == 'empoweb':  # EmPoWeb APIs, ONLY for ground truth analysis
        sensitive_apis_path = EMPOWEB_APIS_PATH

    elif sensitive_apis_path is None or not os.path.isfile(sensitive_apis_path):
        # Should not happen, but just in case
        logging.critical('%s is not a valid path. No sensitive APIs can be considered. The '
                         'extension %s cannot be analyzed', sensitive_apis_path, extension_path)
        record_crash('no-valid-api-file')
        return None

    with open(sensitive_apis_path, encoding='utf-8') as json_data:
        try:
            return json.load(json_data)
        except json.JSONDecodeError:  # Should not happen, but just in case
            logging.critical('Something went wrong to open %s', sensitive_apis_path)

    # Only reached if the file content is not valid JSON
    logging.critical('No sensitive APIs can be considered. '
                     'The extension %s cannot be analyzed', extension_path)
    record_crash('no-api-considered')
    return None


def store_analysis_results(extension_path, json_analysis, json_messages, res_dict, messages_dict):
    """ Dumps res_dict as JSON in json_analysis, for the extension in extension_path.

    If json_analysis is None, the results go to <extension_path>/analysis.json.
    The storage of messages_dict in json_messages is currently disabled (cf. the commented
    code below), so that these two parameters are not used. """

    target = json_analysis
    if target is None:
        target = os.path.join(extension_path, 'analysis.json')

    with open(target, 'w') as results_file:
        json.dump(res_dict, results_file, indent=4, sort_keys=False, default=default,
                  skipkeys=True)

    # if json_messages is None:
    #     json_messages = os.path.join(extension_path, 'messages.json')
    # with open(json_messages, 'w') as json_data:
    #     json.dump(messages_dict, json_data, indent=4, sort_keys=False, default=default,
    #               skipkeys=True)


def default(o):
    """ Because of TypeError: Object of type ValueExpr is not JSON serializable.
    Conversion of such objects into str.

    For a ValueExpr / FunctionExpression node, the root of the tree is reached by following the
    parent links; if the root has a "filename" attribute, the bytes of this file delimited by
    the node's 'range' attribute are returned, decoded as UTF-8 (invalid bytes ignored) and with
    each run of 2+ spaces replaced by a single one. In every other case: str(o). """

    if isinstance(o, (_node.ValueExpr, _node.FunctionExpression)):
        # Starting from the node itself, so that a node without parent does not crash
        root = o
        while root.parent:
            root = root.parent
        if "filename" in root.attributes:
            filename = root.attributes["filename"]
            with open(filename, 'rb') as source_file:  # The file is closed once read
                raw_code = source_file.read()[o.attributes['range'][0]:
                                              o.attributes['range'][1]]
            return re.sub(' {2,}', ' ', raw_code.decode("utf8", "ignore"))

    return str(o)