Skip to content

Commit

Permalink
First commit extracted from https://github.com/ozone-his/ozone-analytics
Browse files Browse the repository at this point in the history
  • Loading branch information
enyachoke committed Jul 6, 2022
0 parents commit 7b3e6f1
Show file tree
Hide file tree
Showing 51 changed files with 2,873 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/
target/
116 changes: 116 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

# Created by https://www.toptal.com/developers/gitignore/api/intellij
# Edit at https://www.toptal.com/developers/gitignore?templates=intellij

### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
.idea/
target/
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml
dependency-reduced-pom.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

# End of https://www.toptal.com/developers/gitignore/api/intellij
24 changes: 24 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

# --- Build stage: compile the ETL jobs with Maven ------------------------------
FROM maven:3.5-jdk-8-alpine AS builder
WORKDIR /build
# Copy only the POM first so the dependency layer stays cached until pom.xml changes.
COPY pom.xml pom.xml
# Pre-fetch dependencies into their own cacheable layer.
RUN mvn dependency:go-offline
COPY src src/
RUN mvn clean package

# --- Runtime stage: Flink image plus connector jars and the job artifacts ------
FROM flink:1.14.4-scala_2.12-java8
# Fetch all connector/runtime jars in a single layer; one wget per dependency,
# chained so a failed download fails the build.
RUN wget https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc_2.12/1.14.4/flink-connector-jdbc_2.12-1.14.4.jar -O /opt/flink/lib/flink-connector-jdbc_2.12-1.14.4.jar \
    && wget https://repo1.maven.org/maven2/org/apache/flink/flink-parquet_2.11/1.13.5/flink-parquet_2.11-1.13.5.jar -O /opt/flink/lib/flink-parquet_2.11-1.13.5.jar \
    && wget https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar -O /opt/flink/lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar \
    && wget https://repo1.maven.org/maven2/org/apache/parquet/parquet-hadoop/1.12.2/parquet-hadoop-1.12.2.jar -O /opt/flink/lib/parquet-hadoop-1.12.2.jar \
    && wget https://repo1.maven.org/maven2/org/apache/parquet/parquet-common/1.12.2/parquet-common-1.12.2.jar -O /opt/flink/lib/parquet-common-1.12.2.jar \
    && wget https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.13/httpclient-4.5.13.jar -O /opt/flink/lib/httpclient-4.5.13.jar \
    && wget https://repo1.maven.org/maven2/org/apache/httpcomponents/httpcore/4.4.15/httpcore-4.4.15.jar -O /opt/flink/lib/httpcore-4.4.15.jar \
    && wget https://repo1.maven.org/maven2/com/ecwid/consul/consul-api/1.4.5/consul-api-1.4.5.jar -O /opt/flink/lib/consul-api-1.4.5.jar \
    && wget https://repo1.maven.org/maven2/com/google/code/gson/gson/2.9.0/gson-2.9.0.jar -O /opt/flink/lib/gson-2.9.0.jar
# Absolute paths into the builder stage — independent of its working directory.
COPY --from=builder /build/target/ozone-etl-flink-1.0-SNAPSHOT-etl-streaming.jar /opt/flink/usrlib/streaming-etl-job.jar
COPY --from=builder /build/target/ozone-etl-flink-1.0-SNAPSHOT-etl-migrations.jar /opt/flink/usrlib/ozone-etl-migrations.jar
COPY run.sh /run.sh
RUN chmod +x /run.sh
# /docker-entrypoint.sh is provided by the flink base image.
# NOTE(review): /run.sh is copied but never referenced by ENTRYPOINT/CMD — confirm it is needed.
ENTRYPOINT ["/docker-entrypoint.sh"]
14 changes: 14 additions & 0 deletions Dockerfile_batch
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# --- Build stage: compile the batch ETL jobs with Maven ------------------------
FROM maven:3.5-jdk-8-alpine AS builder
WORKDIR /build
# Copy only the POM first so the dependency layer stays cached until pom.xml changes.
COPY pom.xml pom.xml
# Pre-fetch dependencies into their own cacheable layer (matches the main Dockerfile).
RUN mvn dependency:go-offline
COPY src src/
RUN mvn clean install -Pbatch

# --- Runtime stage: minimal JRE with the export job ----------------------------
FROM openjdk:8-jre-alpine
# Default export destination; override at run time if needed.
ENV OUTPUT_DIR=/parquet
# Absolute paths into the builder stage — independent of its working directory.
COPY --from=builder /build/target/ozone-etl-flink-1.0-SNAPSHOT-etl-export.jar etl-export.jar
COPY --from=builder /build/target/ozone-etl-flink-1.0-SNAPSHOT-etl-migrations.jar /opt/flink/usrlib/ozone-etl-migrations.jar
COPY batch-export.sh ./batch-export.sh
COPY run.sh /run.sh
RUN chmod +x /run.sh
# batch-export.sh validates required env vars, then launches etl-export.jar.
CMD [ "/bin/sh", "./batch-export.sh" ]
88 changes: 88 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@

# Ozone ETL pipelines

## Flink



This repository contains an ETL [Flink](https://ci.apache.org/projects/flink/flink-docs-master/) [job](https://ci.apache.org/projects/flink/flink-docs-master/docs/internals/job_scheduling/#:~:text=A%20Flink%20job%20is%20first,it%20cancels%20all%20running%20tasks) for flattening [Ozone HIS](https://github.com/ozone-his) data.

## Features



- Provides both batch and streaming modes

- Currently flattens OpenMRS to output reporting friendly tables for:

- patients

- observations

- visits

- concepts



## Tech

- [Flink](https://ci.apache.org/projects/flink/flink-docs-master/)

- [Flink CDC connectors](https://github.com/ververica/flink-cdc-connectors) - For streaming



## Building and Installation



### Prerequisites

- A running Flink [installation](https://ci.apache.org/projects/flink/flink-docs-release-1.13/docs/try-flink/local_installation/)

- A running OpenMRS installation

- A running PostgreSQL installation

- A liquibase installation



### Build and install

- Update the [job.properties](./job.properties) files with the details for your OpenMRS MySQL database and the analytics PostgreSQL database

- Build with `mvn clean package`

- Retrieve and apply the [Liquibase file](https://github.com/ozone-his/ozonepro-docker/tree/master/flink/liquidbase) needed to create tables on the analytics database (more on installation and usage of Liquibase see [liquibase](https://www.liquibase.org/get-started/quickstart))

- Run with `flink run -m <flink-job-manager-url> target/ozone-etl-flink-1.0-SNAPSHOT.jar`

#### Building Docker image

`docker build -t mekomsolutions/ozone-flink-jobs .`

### Layout

src/main/java/net/mekomsolutions/data/pipelines/batch

src/main/java/net/mekomsolutions/data/pipelines/export

src/main/java/net/mekomsolutions/data/pipelines/streaming

src/main/java/net/mekomsolutions/data/pipelines/utils

src/main/java/net/mekomsolutions/data/pipelines/shared



### Adding new jobs
#### Sources and Sinks
This project uses Flink's [Table API & SQL](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/dev/table/overview/). For you to access data, you need to set up [temporary](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/dev/table/sql/create/) tables both for data sources and data sinks. For most cases, all the tables you will ever need for writing OpenMRS data processing jobs are already added under `src/main/java/net/mekomsolutions/data/pipelines/shared/dsl`. But in case you need to add more, you can add them to either `src/main/java/net/mekomsolutions/data/pipelines/shared/dsl/source` or `src/main/java/net/mekomsolutions/data/pipelines/shared/dsl/sink`. You will then need to register them in the factory class `src/main/java/net/mekomsolutions/data/pipelines/shared/dsl/TableDSLFactory.java`. See `src/main/java/net/mekomsolutions/data/pipelines/streaming/StreamingETLJob.java` for an example of how to use the factory to set up sources and sinks.

#### Setting up jobs
Assuming you already have all the source and sink tables setup adding new jobs involves;

- Adding your SQL files to `src/main/resources`
- Registering the jobs in `flink-jobs/src/main/resources/jobs.json`
7 changes: 7 additions & 0 deletions batch-export.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh
# Launcher for the batch export job: validates required configuration,
# then runs the ETL export jar with the supplied connection settings.
#
# ${VAR:?msg} aborts when VAR is unset OR empty (the original JDBC_URL check
# was missing the ':' and therefore accepted an empty-but-set value).
: "${JDBC_URL:?Need to set JDBC_URL}"
: "${JDBC_USERNAME:?Need to set JDBC_USERNAME}"
: "${JDBC_PASSWORD:?Need to set JDBC_PASSWORD}"
: "${OUTPUT_DIR:?Need to set OUTPUT_DIR}"
: "${LOCATION_TAG:?Need to set LOCATION_TAG}"

# Quote every expansion so values containing spaces survive word splitting;
# exec so java replaces the shell and receives container signals directly.
exec java -jar etl-export.jar \
    --jdbc-url "$JDBC_URL" \
    --jdbc-username "$JDBC_USERNAME" \
    --jdbc-password "$JDBC_PASSWORD" \
    --output-dir "$OUTPUT_DIR" \
    --location-tag "$LOCATION_TAG"
16 changes: 16 additions & 0 deletions job.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
job.streaming=true
openmrs.database.username=root
openmrs.database.password=3cY8Kve4lGey
openmrs.database.batch.url=jdbc:mysql://localhost:3306/openmrs
openmrs.database.streaming.hostname=localhost
openmrs.database.streaming.port=3306
openmrs.database.streaming.database-name=openmrs
properties.bootstrap.servers=localhost:9092


analytics.database.username=postgres
analytics.database.password=password
analytics.database.batch.url=jdbc:postgresql://localhost:5432/analytics

analytics.filesystem=true

Loading

0 comments on commit 7b3e6f1

Please sign in to comment.