Commit: Code release
oargudo committed Nov 11, 2019
1 parent d0a4d68 commit 93fb3b3
Showing 23 changed files with 5,334 additions and 0 deletions.
338 changes: 338 additions & 0 deletions Analysis.ipynb
@@ -0,0 +1,338 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"import subprocess\n",
"import numpy as np\n",
"import pandas as pd\n",
"from utils.coords import *\n",
"from utils.shapefiles import *\n",
"from analysis.peaksdata import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download prominence and isolation lists from Andrew Kirmse project:\n",
"https://github.com/akirmse/mountains"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# path to prominence and isolation files\n",
"prominenceDB = 'data/prominence-p100.txt'\n",
"isolationDB = 'data/alliso-sorted.txt'\n",
"\n",
"if not os.path.exists(prominenceDB) or not os.path.exists(isolationDB):\n",
" print('ERROR: peak databases not found!')"
]
},
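{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not part of the original pipeline), the hypothetical cell below peeks at the first raw records of each database; the exact column layout is documented in the mountains project."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional sanity check: print the first few raw records of each database\n",
"for db in [prominenceDB, isolationDB]:\n",
"    if os.path.exists(db):\n",
"        print(db)\n",
"        with open(db) as f:\n",
"            for _ in range(3):\n",
"                print('   ', f.readline().strip())"
]
},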
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# region shapefiles\n",
"regionShapesDir = 'data/regionShapes'\n",
"regionShapes = [f for f in os.listdir(regionShapesDir) if f.endswith('.shp')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Filter and unify prominence and isolation peak lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"regionPeaksDir = 'data/regionPeaks'\n",
"\n",
"if not os.path.exists(regionPeaksDir):\n",
" os.makedirs(regionPeaksDir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# process each region to filter the database peaks that are inside\n",
"# since this process takes a long time, we provide the functions as standalone scripts for batch processing\n",
"for region in regionShapes:\n",
" print(region)\n",
" \n",
" print(' 1/3 Filtering prominence DB...')\n",
" subprocess.call('python analysis/filterPoints.py \"%s\" data/prominence-p100.txt prom.txt' \n",
" % (os.path.join(regionShapesDir, region)))\n",
" print(' 2/3 Filtering isolation DB...')\n",
" subprocess.call('python analysis/filterPoints.py \"%s\" data/alliso-sorted.txt isol.txt' \n",
" % (os.path.join(regionShapesDir, region)))\n",
"\n",
" # merge lists\n",
" print(' 3/3 Merging lists...')\n",
" subprocess.call('python analysis/mergePeaklists.py isol.txt prom.txt tmppeaks.csv --deleteOriginals')\n",
"\n",
" # move results to output dir\n",
" shutil.move(\"tmppeaks.csv\", os.path.join(regionPeaksDir, region.replace(\".shp\", \".csv\")))\n",
" \n",
"print('done!')"
]
},
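{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the cell below is a minimal sketch of the point-in-polygon test that `filterPoints.py` is assumed to perform. It assumes `shapely` and `fiona` are available and that each shapefile contains a single polygon; in practice, use the standalone script, which also handles the database format."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch of the point-in-polygon test assumed in filterPoints.py\n",
"# (reference only; requires shapely and fiona, single-polygon shapefile)\n",
"from shapely.geometry import Point, shape\n",
"import fiona\n",
"\n",
"def insideRegion(shpPath, lat, lon):\n",
"    with fiona.open(shpPath) as shp:\n",
"        poly = shape(next(iter(shp))['geometry'])\n",
"    # shapefiles store coordinates in (lon, lat) order\n",
"    return poly.contains(Point(lon, lat))"
]
},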
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute statistics "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"regionStatsDir = 'data/regionStats'\n",
"\n",
"if not os.path.exists(regionStatsDir):\n",
" os.makedirs(regionStatsDir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# statistics disk radius\n",
"diskRadius = 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"writeFeatures = ['elevation', 'elevRel', 'prominence', 'promRel', \n",
" 'dominance', 'domGroup', 'relevance',\n",
" 'isolation', 'isolDir', 'saddleDist', 'saddleDir']\n",
"\n",
"def writeHeaderToFile(fout, distributions):\n",
" fout.write('lat,lon,peaks')\n",
" for feat in writeFeatures:\n",
" if feat in ['elevation', 'prominence', 'isolDir', 'saddleDir']:\n",
" for val in distributions[feat]['bins'][:-1]:\n",
" fout.write(',%s_%d' % (feat, int(val)))\n",
" elif feat == 'domGroup':\n",
" for val in distributions[feat]['bins'][:-1]:\n",
" fout.write(',%s_%.2f' % (feat, 100*val))\n",
" else:\n",
" for val in distributions[feat]['bins'][:-1]:\n",
" fout.write(',%s_%.2f' % (feat, val))\n",
" fout.write('\\n')\n",
"\n",
"def writeLocationStatsToFile(fout, lat, lon, npeaks, distributions):\n",
" fout.write('%.4f,%.4f,%d'%(lat, lon, npeaks))\n",
" for feat in writeFeatures:\n",
" for val in distributions[feat]['hist']:\n",
" fout.write(',%d' % val)\n",
" fout.write('\\n')"
]
},
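{
"cell_type": "markdown",
"metadata": {},
"source": [
"The two writer functions above assume that `distributions` maps each feature name to a dict with `bins` (edges, whose last entry is dropped when writing headers) and `hist` (per-bin counts), i.e. the output of `numpy.histogram`. A minimal sketch of that assumed layout (the real dict is built by `computeDistributions` in `analysis/peaksdata.py`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch of the assumed 'distributions' layout, via numpy.histogram\n",
"dummyValues = np.random.rand(100)  # stand-in feature values in [0, 1]\n",
"hist, bins = np.histogram(dummyValues, bins=10, range=(0.0, 1.0))\n",
"exampleDistributions = {'dominance': {'hist': hist, 'bins': bins}}\n",
"\n",
"# writeHeaderToFile uses the left bin edges, writeLocationStatsToFile the counts\n",
"print(exampleDistributions['dominance']['bins'][:-1])\n",
"print(exampleDistributions['dominance']['hist'])"
]
},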
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# process each region (note: it takes a long time!)\n",
"for region in regionShapes:\n",
" \n",
" # sample stats locations inside polygon, separated at least 1/2 radius distance\n",
" sampleLocations = sampleShapefileLocations(os.path.join(regionShapesDir, region), diskRadius)\n",
"\n",
" # region peaks DB\n",
" df = pd.read_csv(os.path.join(regionPeaksDir, region.replace('.shp', '.csv')))\n",
" df = addExtraColumns(df)\n",
" \n",
" # normalize distance columns\n",
" df['isolation'] /= diskRadius\n",
" df['saddleDist'] /= diskRadius\n",
" \n",
" # results file\n",
" fout = open(os.path.join(regionStatsDir, region.replace('.shp', '.csv')), 'w')\n",
" headerWritten = False\n",
" \n",
" # compute statistics\n",
" for di,diskCenter in enumerate(sampleLocations):\n",
" \n",
" # filter peaks in disk using haversine distance\n",
" peaks = filterPeaksHaversineDist(df, diskCenter, diskRadius)\n",
" \n",
" # skip if not enough peaks\n",
" if peaks.shape[0] < 20:\n",
" continue\n",
" \n",
"        # compute statistics\n",
"        # diskRadius=1.0 so the isolation/saddle distance histogram axes run from 0 to 1 (we normalized the distances above)\n",
"        # detailed=False gives the classification histograms; for synthesis, the number of bins is doubled\n",
"        distributions = computeDistributions(peaks, diskRadius=1.0, detailed=False)\n",
" \n",
" # write dataset headers if first location\n",
" if not headerWritten:\n",
" writeHeaderToFile(fout, distributions)\n",
" headerWritten = True\n",
" \n",
" # write data line\n",
" writeLocationStatsToFile(fout, diskCenter[0], diskCenter[1], peaks.shape[0], distributions)\n",
" \n",
" print('%s: %3d/%3d samples'%(region, di+1, len(sampleLocations)), end='\\r' if di+1 < len(sampleLocations) else '\\n')\n",
" \n",
" fout.close()\n",
"\n",
"print('done!')"
]
},
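{
"cell_type": "markdown",
"metadata": {},
"source": [
"`filterPeaksHaversineDist` is imported from `analysis.peaksdata`; the cell below sketches one plausible implementation of the underlying haversine disk filter, for reference only. It assumes the peaks table has `latitude` and `longitude` columns; the actual column names and implementation may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch of a haversine-based disk filter (reference only;\n",
"# the pipeline uses filterPeaksHaversineDist from analysis.peaksdata)\n",
"def haversineKm(lat1, lon1, lat2, lon2):\n",
"    R = 6371.0  # mean Earth radius in km\n",
"    p1, p2 = np.radians(lat1), np.radians(lat2)\n",
"    dphi = np.radians(lat2 - lat1)\n",
"    dlam = np.radians(lon2 - lon1)\n",
"    a = np.sin(dphi/2)**2 + np.cos(p1)*np.cos(p2)*np.sin(dlam/2)**2\n",
"    return 2.0*R*np.arcsin(np.sqrt(a))\n",
"\n",
"def filterDiskSketch(df, center, radiusKm):\n",
"    # assumes 'latitude'/'longitude' columns; vectorized over all rows\n",
"    d = haversineKm(df['latitude'], df['longitude'], center[0], center[1])\n",
"    return df[d <= radiusKm]"
]
},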
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# file where the dataset will be stored\n",
"fileDataset = 'data/regions_%dkm.csv' % (int(diskRadius))\n",
"\n",
"# regions to put in the dataset (for example, we could omit certain regions, we can also do it later in classifier)\n",
"datasetRegions = [f for f in os.listdir(regionStatsDir) if f.endswith('.csv')]\n",
"\n",
"# if True: write frequencies, otherwise keep histogram counts\n",
"# we observed that frequencies work better in the classifier\n",
"normalize = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"alldf = []\n",
"for file in datasetRegions:\n",
" \n",
" # name\n",
" terrainName = file.split('.')[0]\n",
" \n",
" # read dataframe \n",
" df = pd.read_csv(os.path.join(regionStatsDir, file))\n",
" \n",
" # keep number of peaks\n",
" npeaks = df['peaks'].values\n",
" \n",
" # drop lat, lon, npeaks\n",
" df.drop(['lat', 'lon', 'peaks'], axis=1, inplace=True)\n",
" \n",
" # normalize histogram columns?\n",
" if normalize:\n",
" for c in df.columns:\n",
" df[c] = df[c].astype(np.float32)/npeaks\n",
" \n",
" # add terrain name column\n",
" df.insert(0, 'terrain', terrainName)\n",
" alldf.append(df)\n",
" \n",
" print('%4d %s' % (df.shape[0], terrainName))\n",
" \n",
"alldf = pd.concat(alldf, ignore_index=True)\n",
"alldf.to_csv(fileDataset, float_format='%.4f', index=False)"
]
},
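{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (not in the original notebook), we can reload the written CSV and count the samples per terrain, which is roughly how a classifier would consume the dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional check: reload the dataset and count samples per terrain\n",
"dataset = pd.read_csv(fileDataset)\n",
"print(dataset.shape)\n",
"print(dataset['terrain'].value_counts())"
]
},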
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}