Commit
Showing 8 changed files with 451 additions and 1 deletion.

.gitignore
@@ -1,2 +1,17 @@
*.class
*.log
# scala ignores
# Simple Build Tool
# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
.history
.cache
.lib/
# IDE ignores
.bsp/
.idea/

README.md
@@ -1,2 +1,122 @@
# joernCpgExport
Export ShiftLeft Code Property Graph (cpg) from OverflowDB to json and csv.
The csv can be imported into the Neo4j graph database.
Neo4j can be used to visualize the graph and to write queries via cypher.

**Please note:** The OverflowDB file format is not compatible between different versions.
Be sure to use the same OverflowDB version in joern and joernCpgExport.

Reference:
* https://github.com/ShiftLeftSecurity/codepropertygraph
* https://github.com/ShiftLeftSecurity/joern/
* https://github.com/ShiftLeftSecurity/overflowdb
* https://neo4j.com/

## Build
```shell
$ sbt stage
```

## Usage
```shell
$ ./target/universal/stage/bin/joernCpgExport --help
Usage: joernCpgExport [options]

  -d, --no-default-overlays  do not apply default overlays
  -o, --no-oss-dataflow      do not apply oss dataflow overlay
  -c, --cpg <cpg.bin>        load cpg from OverflowDB
  -j, --json <cpg.json>      export cpg as json file
  --help                     prints this usage text
```

## Example
```shell
# use fuzzyc2cpg from joern to parse source code and create an OverflowDB file
$ fuzzyc2cpg.sh --output cpg.bin src
# convert the OverflowDB file into json
$ joernCpgExport --cpg cpg.bin --json cpg.json
```

cpg.json:
```json
{
  "nodes" : [ {
    "ID" : 1,
    "OVERLAYS" : [ "semanticcpg", "dataflowOss" ],
    "VERSION" : "0.1",
    "LANGUAGE" : "C",
    "TYPE" : "META_DATA"
  }, {
    "ID" : 2,
    "FULL_NAME" : "<global>",
    "ORDER" : 1,
    "TYPE" : "NAMESPACE_BLOCK",
    "NAME" : "<global>",
    "FILENAME" : "<unknown>"
  }, {
    "ID" : 100,
    "TYPE_DECL_FULL_NAME" : "",
    "NAME" : "",
    "FULL_NAME" : "",
    "TYPE" : "TYPE"
  }, {
    "ID" : 101,
    "TYPE_DECL_FULL_NAME" : " [ 1 ]",
    "NAME" : " [ 1 ]",
    "FULL_NAME" : " [ 1 ]",
    "TYPE" : "TYPE"
  },
  [...]
  ]
}
```

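To get a quick feel for the export before converting it, you can load `cpg.json` with pandas (the same library `scripts/joern-json-csv.py` uses). This is only a minimal sketch; the `edges` key is assumed here because the conversion script reads it, even though it is not shown in the excerpt above.

```python
import json
import pandas as pd

# Load the exported graph and count nodes per CPG node type.
with open("cpg.json") as fh:
    cpg = json.load(fh)

nodes = pd.DataFrame(cpg["nodes"])
edges = pd.DataFrame(cpg["edges"])  # key assumed, as read by scripts/joern-json-csv.py

print(nodes["TYPE"].value_counts())
print(f"{len(nodes)} nodes, {len(edges)} edges")
```
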
## Convert json to csv
You can convert the json file into csv files, grouped by node type.
The csv files can be imported into Neo4j via bulk import.
They are created in the `csv` subfolder of the base directory of the provided `cpg.json`.
Existing files will be overwritten.
```shell
# pandas is required as a dependency
$ pip3 install --user pandas
$ scripts/joern-json-csv.py <cpg.json>
```
csv/edge.csv:
```csv
:END_ID,:START_ID,:TYPE,VARIABLE
1024809,1313,REF,
1154373,1314,REF,
1024810,1315,REF,
1154374,1316,REF,
1030528,1317,REF,
1154375,1318,REF,
1024811,1319,REF,
1154376,1320,REF,
1154377,1321,REF,
1154378,1322,REF,
1154379,1323,REF,
```

## Import into Neo4j
Neo4j offers two methods to import csv data: (1) via the cypher `LOAD CSV` command and (2) via the `neo4j-admin` command.
For (1) you first need to create the graph model, and the data types of the csv columns must be converted explicitly during loading.
For (2) the database must be empty (deleting all nodes and relationships does not seem to be sufficient); data types are detected automatically or declared in the column headers.
See also: https://neo4j.com/developer/guide-import-csv/

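A minimal sketch of method (1), driving `LOAD CSV` from the `neo4j` Python driver (an assumption, not part of this repository; any Cypher client works). The file name, node label, and property columns are illustrative and depend on which `vertex_*.csv` you load:

```python
# Sketch of method (1): LOAD CSV via Cypher, driven from Python.
# Assumes `pip3 install --user neo4j` and that the csv files were copied
# into Neo4j's import directory first.
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "<password>"))
with driver.session() as session:
    session.run(
        "LOAD CSV WITH HEADERS FROM 'file:///vertex_METHOD.csv' AS row "
        "CREATE (m:METHOD {id: toInteger(row.`METHOD:ID`), name: row.NAME})"
    )
driver.close()
```
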
Neo4j commands:
```shell
# stop Neo4j service
$ neo4j stop
# delete database
$ rm -rf data
# set Neo4j admin password
$ neo4j-admin set-initial-password <password>
# batch import
$ neo4j-admin import --multiline-fields=true --quote=\" --relationships=csv/edge.csv --nodes=csv/vertex_BLOCK.csv --nodes=csv/vertex_CALL.csv ...
# start Neo4j service
$ neo4j start
```

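After the bulk import, a quick sanity check can be run from any Cypher client; again a sketch with the `neo4j` Python driver (an assumption, not part of this repository):

```python
from neo4j import GraphDatabase

# Count imported nodes and relationships to verify the bulk import.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "<password>"))
with driver.session() as session:
    n = session.run("MATCH (n) RETURN count(n) AS c").single()["c"]
    r = session.run("MATCH ()-[x]->() RETURN count(x) AS c").single()["c"]
    print(f"{n} nodes, {r} relationships")
driver.close()
```
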
CPG viewed in the Neo4j web UI:


build.sbt
@@ -0,0 +1,95 @@

// The simplest possible sbt build file is just one line:

scalaVersion := "2.13.3"
// That is, to create a valid sbt build, all you've got to do is define the
// version of Scala you'd like your project to use.

// ============================================================================

// Lines like the above defining `scalaVersion` are called "settings". Settings
// are key/value pairs. In the case of `scalaVersion`, the key is "scalaVersion"
// and the value is "2.13.3".

// It's possible to define many kinds of settings, such as:

name := "joernCpgExport"
organization := "de.peckto"
version := "1.0"

// Note, it's not required for you to define these three settings. These are
// mostly only necessary if you intend to publish your library's binaries on a
// place like Sonatype or Bintray.

// Want to use a published library in your project?
// You can define other libraries as dependencies in your build like this:

libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "1.1.2"

// Here, `libraryDependencies` is a set of dependencies, and by using `+=`,
// we're adding the scala-parser-combinators dependency to the set of dependencies
// that sbt will go and fetch when it starts up.
// Now, in any Scala file, you can import classes, objects, etc., from
// scala-parser-combinators with a regular import.

// TIP: To find the "dependency" that you need to add to the
// `libraryDependencies` set, which in the above example looks like this:

// "org.scala-lang.modules" %% "scala-parser-combinators" % "1.1.2"

// You can use Scaladex, an index of all known published Scala libraries. There,
// after you find the library you want, you can just copy/paste the dependency
// information that you need into your build file. For example, on the
// scala/scala-parser-combinators Scaladex page,
// https://index.scala-lang.org/scala/scala-parser-combinators, you can copy/paste
// the sbt dependency from the sbt box on the right-hand side of the screen.

// IMPORTANT NOTE: while build files look _kind of_ like regular Scala, it's
// important to note that syntax in *.sbt files doesn't always behave like
// regular Scala. For example, notice in this build file that it's not required
// to put our settings into an enclosing object or class. Always remember that
// sbt is a bit different, semantically, than vanilla Scala.

// ============================================================================

// Most moderately interesting Scala projects don't make use of the very simple
// build file style (called "bare style") used in this build.sbt file. Most
// intermediate Scala projects make use of so-called "multi-project" builds. A
// multi-project build makes it possible to have different folders which sbt can
// be configured differently for. That is, you may wish to have different
// dependencies or different testing frameworks defined for different parts of
// your codebase. Multi-project builds make this possible.

// Here's a quick glimpse of what a multi-project build looks like for this
// build, with only one "subproject" defined, called `root`:

// lazy val root = (project in file(".")).
//   settings(
//     inThisBuild(List(
//       organization := "ch.epfl.scala",
//       scalaVersion := "2.13.3"
//     )),
//     name := "hello-world"
//   )

// To learn more about multi-project builds, head over to the official sbt
// documentation at http://www.scala-sbt.org/documentation.html

val cpgVersion = "1.3.121"

libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.3" % Runtime

libraryDependencies += "io.shiftleft" % "overflowdb-tinkerpop3" % "1.11"

libraryDependencies += "io.shiftleft" %% "codepropertygraph" % cpgVersion
libraryDependencies += "io.shiftleft" %% "semanticcpg" % cpgVersion
libraryDependencies += "io.shiftleft" %% "dataflowengineoss" % cpgVersion

libraryDependencies += "com.fasterxml.jackson.core" % "jackson-databind" % "2.2.2"
libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.12.2"
libraryDependencies += "com.github.scopt" %% "scopt" % "4.0.0"

enablePlugins(JavaAppPackaging)

scalacOptions := List("-encoding", "utf8", "-Xfatal-warnings", "-deprecation", "-unchecked")

project/build.properties
@@ -0,0 +1 @@
sbt.version=1.4.9

project/plugins.sbt
@@ -0,0 +1 @@
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.1")

scripts/joern-json-csv.py
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""Convert a joernCpgExport cpg.json into csv files for Neo4j bulk import."""
import sys
import os
import json
import pandas as pd


def joern_json_csv(path):
    # Write the csv files into a 'csv' subfolder next to the provided cpg.json.
    base = os.path.dirname(path)
    csv_dir = os.path.join(base, 'csv')
    if not os.path.exists(csv_dir):
        os.mkdir(csv_dir)

    with open(path) as fh:
        j = json.load(fh)
    vertex = pd.DataFrame(j['nodes'])
    edge = pd.DataFrame(j['edges'])

    # One csv file per node type, with headers renamed to the
    # neo4j-admin import conventions (<label>:ID, :LABEL).
    d = {'nodes': []}
    for label in vertex['TYPE'].unique():
        df = vertex[vertex['TYPE'] == label]
        df = df.dropna(how='all', axis=1)
        df = df.rename(columns={'ID': f'{label}:ID', 'TYPE': ':LABEL'})
        f = os.path.join(csv_dir, f'vertex_{label}.csv')
        d['nodes'].append(f)
        df.to_csv(f, index=False)

    # All edges go into a single csv file with :START_ID/:END_ID/:TYPE headers.
    edge = edge.dropna(how='all', axis=1)
    edge = edge.rename(
        columns={'outV': ':END_ID', 'inV': ':START_ID', 'TYPE': ':TYPE'})
    f = os.path.join(csv_dir, 'edge.csv')
    d['relationships'] = f
    edge.to_csv(f, index=False)

    return d


if __name__ == '__main__':
    joern_json_csv(sys.argv[1])
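
The dictionary returned by `joern_json_csv` maps directly onto the `--nodes`/`--relationships` arguments of the `neo4j-admin import` command shown in the README. A hypothetical helper (not part of this commit) could assemble that command line:

```python
# Hypothetical helper: build the neo4j-admin bulk-import arguments
# from the dict returned by joern_json_csv().
def neo4j_admin_import_args(d):
    args = ['neo4j-admin', 'import', '--multiline-fields=true', '--quote="']
    args += [f'--nodes={path}' for path in d['nodes']]
    args.append(f"--relationships={d['relationships']}")
    return args

# e.g. print(' '.join(neo4j_admin_import_args(joern_json_csv('cpg.json'))))
```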