Skip to content

Commit

Permalink
add report and clean project
Browse files Browse the repository at this point in the history
Signed-off-by: M-HRL <[email protected]>
  • Loading branch information
M-HRL committed Jan 10, 2024
1 parent deab4e9 commit 23a9d9f
Show file tree
Hide file tree
Showing 20 changed files with 1,286 additions and 100 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@
.idea
.code
.obsidian
venv
venv
project/cache
project/.ipynb_checkpoints
4 changes: 2 additions & 2 deletions project/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from src.block.loader.path_sqlite_loader_block import PathSqliteLoaderBlock
from src.block.loader.ride_sqlite_loader_block import RideSqliteLoaderBlock
from src.block.transformer.clean_ride_transformer_block import CleanRideTransformerBlock
from src.block.transformer.extend_path_transformer_block import ExtendPathTransformerBlock
from src.block.transformer.convert_path_coordinates_transformer_block import ConvertPathCoordinatesTransformerBlock

db_file_path = os.path.join(os.path.dirname(__file__), "../data/bike_data.sqlite")
ride_pipeline = Pipeline().register(RideExtractorBlock()).register(CleanRideTransformerBlock()).register(
RideSqliteLoaderBlock())
path_pipeline = Pipeline().register(PathExtractorBlock()).register(ExtendPathTransformerBlock()).register(
path_pipeline = Pipeline().register(PathExtractorBlock()).register(ConvertPathCoordinatesTransformerBlock()).register(
PathSqliteLoaderBlock())

if __name__ == '__main__':
Expand Down
986 changes: 986 additions & 0 deletions project/report.ipynb

Large diffs are not rendered by default.

23 changes: 11 additions & 12 deletions project/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
attrs==22.2.0
greenlet==2.0.2
iniconfig==2.0.0
numpy==1.24.2
packaging==23.0
pandas==1.5.3
pluggy==1.0.0
numpy==1.26.3
pandas==2.1.4
python-dateutil==2.8.2
pytz==2022.7.1
six==1.16.0
SQLAlchemy==1.4.46
typing_extensions==4.5.0
pytest==7.4.3
SQLAlchemy==2.0.25
typing_extensions==4.9.0
pytest==7.4.4
osmnx==1.8.1
matplotlib==3.8.2
scikit-learn==1.3.2
pyproj==3.6.1
notebook==7.0.6
networkx==3.2.1
7 changes: 2 additions & 5 deletions project/src/block/extractor/path_csv_extractor_block.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import pandas as pd

from src.block.extractor.extractor_block import ExtractorBlock
from src.model.entities import Path


class PathExtractorBlock(ExtractorBlock):
def __init__(self):
self.url = "https://opendata.muenchen.de/dataset/7ad3bc6c-4c1a-4a63-9cb2-0d613f5b69fa/resource/14977232-94f3-4cdb-94fc-1e709698ba3f/download/radwege_t2.csv"
def __init__(self, url: str):
self.url = url

def invoke(self, *args) -> tuple[pd.DataFrame]:
return pd.read_csv(self.url, sep=r"\s*,", skipinitialspace=True, header=0,
Expand All @@ -20,6 +19,4 @@ def invoke(self, *args) -> tuple[pd.DataFrame]:
"end_east",
"end_north",
],
index_col="id",
dtype=Path,
engine="python"),
41 changes: 21 additions & 20 deletions project/src/block/extractor/ride_csv_extractor_block.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@
import pandas as pd

from src.block.extractor.extractor_block import ExtractorBlock
from src.model.entities import Ride


class RideExtractorBlock(ExtractorBlock):
def __init__(self):
self.url = "https://www.mvg.de/dam/mvg/services/mobile-services/mvg-rad/fahrten-csv/MVG_Rad_Fahrten_2022.zip"
def __init__(self, url: str, frac: float, seed: int):
self.url = url
self.frac = frac
self.seed = seed

def invoke(self, *args) -> tuple[pd.DataFrame]:
return pd.read_csv(self.url, compression="zip", sep=r"\s*;", skipinitialspace=True, header=0,
names=[
"id",
"start_time",
"end_time",
"start_lat",
"start_lon",
"end_lat",
"end_lon",
"rental_is_station",
"rental_station_name",
"return_is_station",
"return_station_name"
],
index_col="id",
dtype=Ride,
engine="python"),
ride_df = pd.read_csv(self.url, compression="zip", sep=r"\s*;", skipinitialspace=True, header=0,
names=[
"id",
"start_time",
"end_time",
"start_lat",
"start_lon",
"end_lat",
"end_lon",
"rental_is_station",
"rental_station_name",
"return_is_station",
"return_station_name",
],
engine="python")
ride_df = ride_df.sample(frac=self.frac, random_state=self.seed)
return ride_df,
6 changes: 0 additions & 6 deletions project/src/block/loader/loader_block.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
import pandas as pd
from sqlalchemy import create_engine

from src.base.block import Block
from src.model.entities import init_all_tables


class LoaderBlock(Block):
def __init__(self):
conn_string = "sqlite:///../data/bike_data.sqlite"
self.engine = create_engine(conn_string, echo=True, future=True)
init_all_tables(self.engine)

def invoke(self, *args: pd.DataFrame):
return args
14 changes: 11 additions & 3 deletions project/src/block/loader/path_sqlite_loader_block.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import pandas as pd
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session

from src.block.loader.loader_block import LoaderBlock
from src.model.entities import Path


class PathSqliteLoaderBlock(LoaderBlock):
def __init__(self):
super().__init__()
def __init__(self, engine: Engine):
self.engine = engine

def invoke(self, path_df: pd.DataFrame):
path_df.to_sql("path", con=self.engine, if_exists="replace")
records = path_df.to_dict("records")
with Session(self.engine) as session:
for record in records:
path = Path(**record)
session.add(path)
session.commit()
14 changes: 11 additions & 3 deletions project/src/block/loader/ride_sqlite_loader_block.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import pandas as pd
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session

from src.block.loader.loader_block import LoaderBlock
from src.model.entities import Ride


class RideSqliteLoaderBlock(LoaderBlock):
def __init__(self):
super().__init__()
def __init__(self, engine: Engine):
self.engine = engine

def invoke(self, ride_df: pd.DataFrame):
ride_df.to_sql("ride", con=self.engine, if_exists="replace")
records = ride_df.to_dict("records")
with Session(self.engine) as session:
for record in records:
ride = Ride(**record)
session.add(ride)
session.commit()
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import networkx as nx
import pandas as pd
from osmnx.distance import nearest_nodes
from osmnx.routing import shortest_path

from src.block.transformer.transformer_block import TransformerBlock
from src.model.entities import Node


def map_node_ids_to_nodes(node_ids: list[int]) -> list[Node]:
return [Node(osm_id=node_id, route_order=idx) for idx, node_id in enumerate(node_ids)] if node_ids else []


class AddNodeMappingsTransformerBlock(TransformerBlock):
def __init__(self, cpus: int, graph: nx.MultiDiGraph):
self.cpus = cpus
self.graph = graph

def invoke(self, ride_df: pd.DataFrame, path_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
ride_start_node_ids = nearest_nodes(self.graph, X=ride_df["start_lon"].astype(float).to_list(),
Y=ride_df["start_lat"].astype(float).to_list())
ride_end_node_ids = nearest_nodes(self.graph, X=ride_df["end_lon"].astype(float).to_list(),
Y=ride_df["end_lat"].astype(float).to_list())

ride_df["nodes"] = shortest_path(self.graph, ride_start_node_ids, ride_end_node_ids, weight="travel_time",
cpus=self.cpus)
ride_df["nodes"] = ride_df["nodes"].map(map_node_ids_to_nodes)

path_start_node_ids = nearest_nodes(self.graph, X=path_df["start_lon"].to_list(),
Y=path_df["start_lat"].to_list())
path_end_node_ids = nearest_nodes(self.graph, X=path_df["end_lon"].to_list(), Y=path_df["end_lat"].to_list())

path_df["nodes"] = list(zip(path_start_node_ids, path_end_node_ids))
path_df["nodes"] = path_df["nodes"].map(map_node_ids_to_nodes)

return ride_df, path_df
21 changes: 21 additions & 0 deletions project/src/block/transformer/clean_path_transformer_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd

from src.block.transformer.transformer_block import TransformerBlock


class CleanPathTransformerBlock(TransformerBlock):
def invoke(self, path_df: pd.DataFrame) -> tuple[pd.DataFrame]:
path_df["id"] = range(len(path_df))
path_df["shape_length"] = pd.to_numeric(path_df["shape_length"], errors="coerce").astype(float)
path_df["start_east"] = pd.to_numeric(path_df["start_east"], errors="coerce").astype(float)
path_df["start_north"] = pd.to_numeric(path_df["start_north"], errors="coerce").astype(float)
path_df["end_east"] = pd.to_numeric(path_df["end_east"], errors="coerce").astype(float)
path_df["end_north"] = pd.to_numeric(path_df["end_north"], errors="coerce").astype(float)
return path_df.dropna(axis=0, subset=[
"shape_length",
"path_type",
"start_east",
"start_north",
"end_east",
"end_north",
]),
20 changes: 19 additions & 1 deletion project/src/block/transformer/clean_ride_transformer_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,22 @@

class CleanRideTransformerBlock(TransformerBlock):
def invoke(self, ride_df: pd.DataFrame) -> tuple[pd.DataFrame]:
return ride_df.dropna(axis=0, subset=["return_is_station"]),
ride_df["id"] = range(len(ride_df))
ride_df["start_time"] = pd.to_datetime(ride_df["start_time"], errors="coerce")
ride_df["end_time"] = pd.to_datetime(ride_df["end_time"], errors="coerce")
ride_df["start_lat"] = pd.to_numeric(ride_df["start_lat"], errors="coerce").astype(float)
ride_df["start_lon"] = pd.to_numeric(ride_df["start_lon"], errors="coerce").astype(float)
ride_df["end_lat"] = pd.to_numeric(ride_df["end_lat"], errors="coerce").astype(float)
ride_df["end_lon"] = pd.to_numeric(ride_df["end_lon"], errors="coerce").astype(float)
ride_df["rental_is_station"] = pd.to_numeric(ride_df["rental_is_station"], errors="coerce").astype(bool)
ride_df["return_is_station"] = pd.to_numeric(ride_df["return_is_station"], errors="coerce").astype(bool)
return ride_df.dropna(axis=0, subset=[
"start_time",
"end_time",
"start_lat",
"start_lon",
"end_lat",
"end_lon",
"rental_is_station",
"return_is_station",
]),
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
from pyproj import Proj
from pyproj.enums import TransformDirection

from src.block.transformer.transformer_block import TransformerBlock


class ConvertPathCoordinatesTransformerBlock(TransformerBlock):
def invoke(self, path_df: pd.DataFrame) -> tuple[pd.DataFrame]:
# Define the UTM zone for Germany
utm_zone = 32
# Create a transformer
proj = Proj(proj='utm', zone=utm_zone, ellps='WGS84', preserve_units=True)
# Convert UTM to Longitude / Latitude
start_lons, start_lats = proj.transform(xx=path_df["start_east"],
yy=path_df["start_north"],
direction=TransformDirection.INVERSE)
path_df["start_lon"] = start_lons
path_df["start_lat"] = start_lats
end_lons, end_lats = proj.transform(xx=path_df["end_east"],
yy=path_df["end_north"],
direction=TransformDirection.INVERSE)
path_df["end_lon"] = end_lons
path_df["end_lat"] = end_lats
# Drop converted columns
columns_to_drop = ["start_east", "start_north", "end_east", "end_north"]
path_df.drop(columns=columns_to_drop, inplace=True)
return path_df,

This file was deleted.

2 changes: 1 addition & 1 deletion project/src/block/transformer/transformer_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@


class TransformerBlock(Block):
def invoke(self, *args: pd.DataFrame) -> tuple[pd.DataFrame]:
def invoke(self, *args: pd.DataFrame) -> tuple:
return args
Loading

0 comments on commit 23a9d9f

Please sign in to comment.