Skip to content

Datasets on Elasticsearch

Keith Alcock edited this page Jan 9, 2025 · 1 revision

Datasets are gradually being made available via an Elasticsearch instance. People who need access can be provided with credentials and API tokens. As datasets have been added, the name of the main index has been incremented from habitus to habitus7, so that each earlier version remains available to people who need repeatable results. The alias, habitus, has recently been added to point to the most recent and/or correct version of the data.

  • habitus = originally an index, an initial version containing only uganda-mining.tsv (the name is now used as an alias; see the last entry)
  • habitus2 = uganda-mining.tsv only
  • habitus3 = habitus2 + uganda-pdfs-karamoja.tsv + uganda-pdfs.tsv + uganda.tsv
  • habitus4 = habitus3 + dataset55k.tsv, which was found to have errors
  • habitus5 = habitus3 + dataset55k.tsv (corrected) + ghana-regulations.tsv + ghana-set.tsv + ghana-sitemap.tsv + ghana-stakeholders.tsv + senegal-experiment.tsv + senegal-peanuts.tsv + senegal-rice.tsv + senegal-saed.tsv
  • habitus6 = an intermediate version
  • habitus7 = habitus5 with sanitizedLemmas, sanitizedNamedEntities, and updated locations added
  • habitus = now an alias for habitus7

For habitus7, the schema/mapping looks like this:

"properties": {
  "byline": {
    "type": "wildcard"
  },
  "causalRelations": {
    "type": "nested",
    "properties": {
      "cause": {
        "type": "nested",
        "properties": {
          "decCount": {
            "type": "integer"
          },
          "incCount": {
            "type": "integer"
          },
          "negCount": {
            "type": "integer"
          },
          "posCount": {
            "type": "integer"
          },
          "text": {
            "type": "text"
          }
        }
      },
      "effect": {
        "type": "nested",
        "properties": {
          "decCount": {
            "type": "integer"
          },
          "incCount": {
            "type": "integer"
          },
          "negCount": {
            "type": "integer"
          },
          "posCount": {
            "type": "integer"
          },
          "text": {
            "type": "text"
          }
        }
      },
      "index": {
        "type": "integer"
      },
      "negationCount": {
        "type": "integer"
      }
    }
  },
  "chatVector": {
    "type": "dense_vector",
    "dims": 384,
    "index": true,
    "similarity": "dot_product"
  },
  "contextAfter": {
    "type": "text",
    "index": false
  },
  "contextBefore": {
    "type": "text",
    "index": false
  },
  "contextLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "dataset": {
    "type": "keyword"
  },
  "date": {
    "type": "date"
  },
  "dateline": {
    "type": "wildcard"
  },
  "isBelief": {
    "type": "boolean"
  },
  "nextDistance": {
    "type": "integer"
  },
  "nextLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "prevDistance": {
    "type": "integer"
  },
  "prevLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "region": {
    "type": "keyword"
  },
  "sanitized": {
    "type": "boolean"
  },
  "sanitizedLemmas": {
    "properties": {
      "start": {
        "type": "integer"
      },
      "stop": {
        "type": "integer"
      },
      "text": {
        "type": "text"
      }
    }
  },
  "sanitizedLocations": {
    "type": "boolean"
  },
  "sanitizedNamedEntities": {
    "properties": {
      "start": {
        "type": "integer"
      },
      "stop": {
        "type": "integer"
      },
      "text": {
        "type": "text"
      }
    }
  },
  "sentence": {
    "type": "text"
  },
  "sentenceIndex": {
    "type": "integer"
  },
  "sentenceLocations": {
    "type": "nested",
    "properties": {
      "geonameid": {
        "type": "keyword"
      },
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      },
      "ranges": {
        "properties": {
          "start": {
            "type": "integer"
          },
          "stop": {
            "type": "integer"
          }
        }
      }
    }
  },
  "sentiment": {
    "type": "float"
  },
  "terms": {
    "type": "keyword"
  },
  "title": {
    "type": "text"
  },
  "url": {
    "type": "wildcard"
  }
}

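For habitus5 and before, the schema/mapping was somewhat simpler: it has no sanitizedLocations field, and sentenceLocations entries have only location and name (no geonameid or ranges):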

"properties": {
  "byline": {
    "type": "wildcard"
  },
  "causalRelations": {
    "type": "nested",
    "properties": {
      "cause": {
        "type": "nested",
        "properties": {
          "decCount": {
            "type": "integer"
          },
          "incCount": {
            "type": "integer"
          },
          "negCount": {
            "type": "integer"
          },
          "posCount": {
            "type": "integer"
          },
          "text": {
            "type": "text"
          }
        }
      },
      "effect": {
        "type": "nested",
        "properties": {
          "decCount": {
            "type": "integer"
          },
          "incCount": {
            "type": "integer"
          },
          "negCount": {
            "type": "integer"
          },
          "posCount": {
            "type": "integer"
          },
          "text": {
            "type": "text"
          }
        }
      },
      "index": {
        "type": "integer"
      },
      "negationCount": {
        "type": "integer"
      }
    }
  },
  "chatVector": {
    "type": "dense_vector",
    "dims": 384,
    "index": true,
    "similarity": "dot_product"
  },
  "contextAfter": {
    "type": "text",
    "index": false
  },
  "contextBefore": {
    "type": "text",
    "index": false
  },
  "contextLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "dataset": {
    "type": "keyword"
  },
  "date": {
    "type": "date"
  },
  "dateline": {
    "type": "wildcard"
  },
  "isBelief": {
    "type": "boolean"
  },
  "nextDistance": {
    "type": "integer"
  },
  "nextLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "prevDistance": {
    "type": "integer"
  },
  "prevLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "region": {
    "type": "keyword"
  },
  "sanitized": {
    "type": "boolean"
  },
  "sanitizedLemmas": {
    "properties": {
      "start": {
        "type": "integer"
      },
      "stop": {
        "type": "integer"
      },
      "text": {
        "type": "text"
      }
    }
  },
  "sanitizedNamedEntities": {
    "properties": {
      "start": {
        "type": "integer"
      },
      "stop": {
        "type": "integer"
      },
      "text": {
        "type": "text"
      }
    }
  },
  "sentence": {
    "type": "text"
  },
  "sentenceIndex": {
    "type": "integer"
  },
  "sentenceLocations": {
    "type": "nested",
    "properties": {
      "location": {
        "type": "geo_point"
      },
      "name": {
        "type": "wildcard"
      }
    }
  },
  "sentiment": {
    "type": "float"
  },
  "terms": {
    "type": "keyword"
  },
  "title": {
    "type": "text"
  },
  "url": {
    "type": "wildcard"
  }
}