Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add model classes for renku storing entities in solr #624

Merged
merged 19 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@
"command": "alias k=kubectl"
},
"ghcr.io/devcontainers-contrib/features/rclone:1": {},
"./k3d": {}
"./k3d": {},
"ghcr.io/devcontainers/features/java:1": {
"version": "21",
"jdkDistro": "open"
},
"./solr": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/devcontainers-contrib/features/poetry",
Expand Down
18 changes: 18 additions & 0 deletions .devcontainer/solr/devcontainer-feature.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "solr",
"id": "solr",
"version": "1.0.0",
"description": "A feature adding solr",
"options": {
"solr_version": {
"type": "string",
"description": "solr version to install",
"proposals": ["9.8.0"],
"default": "9.8.0"
}
},
"installsAfter": [
"ghcr.io/devcontainers/features/java",
"ghcr.io/devcontainers-contrib/features/bash-command"
]
}
16 changes: 16 additions & 0 deletions .devcontainer/solr/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
# Devcontainer feature installer for Solr.
#
# Downloads the Solr release named by $SOLR_VERSION, unpacks it under /opt,
# and links the launcher script into /usr/local/bin.
set -e

echo "Activation SOLR feature"

# Fail early with a clear message instead of building a broken download URL.
: "${SOLR_VERSION:?SOLR_VERSION must be set}"

download_url="https://dlcdn.apache.org/solr/solr/$SOLR_VERSION/solr-$SOLR_VERSION.tgz"

# -f makes curl exit non-zero on HTTP errors (e.g. an unknown version), so an
# error page is never written to solr.tgz and handed to tar.
curl -fsSL -o solr.tgz "$download_url"
mkdir -p /opt
tar -C /opt -xzf solr.tgz
# The archive is not needed once it has been extracted.
rm -f solr.tgz

# Version-independent path to the installation and to the launcher.
ln -snf "/opt/solr-$SOLR_VERSION" /opt/solr
ln -snf /opt/solr/bin/solr /usr/local/bin/solr

# NOTE(review): world-writable logs/bin directories are presumably there so
# that any container user can start solr - confirm this is still required.
mkdir -p /opt/solr/server/logs
chmod 777 /opt/solr/server/logs
chmod 777 /opt/solr/bin
chown -R vscode:vscode "/opt/solr-$SOLR_VERSION"
1 change: 1 addition & 0 deletions components/renku_data_services/solr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Client library to solr and mapping for renku entities."""
157 changes: 157 additions & 0 deletions components/renku_data_services/solr/entity_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""Defines the entity documents used with Solr."""

from abc import ABC, abstractmethod
from datetime import UTC, datetime
from enum import StrEnum
from typing import Annotated, Any, Self

from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, errors, field_serializer, field_validator
from ulid import ULID

from renku_data_services.authz.models import Visibility
from renku_data_services.base_models.core import Slug
from renku_data_services.solr.solr_client import DocVersion, DocVersions, ResponseBody


def _str_to_slug(value: Any) -> Slug:
    """Coerce a raw field value into a `Slug`.

    Used as a pydantic `BeforeValidator`, so it runs on the raw input before
    the field's own validation.

    Args:
        value: Either a string, converted with `Slug.from_name`, or an
            existing `Slug`, returned unchanged.

    Returns:
        The resulting `Slug`.

    Raises:
        ValueError: If `value` is neither a `str` nor a `Slug`. Pydantic
            reports a `ValueError` raised inside a validator as a regular
            validation error of the model.
    """
    if isinstance(value, str):
        return Slug.from_name(value)
    if isinstance(value, Slug):
        return value
    # `pydantic.errors` exposes no `ValidationError` in pydantic v2, so the
    # previous `errors.ValidationError(message=...)` could not be constructed.
    # Raising ValueError is the supported way to signal failure in a validator.
    raise ValueError("converting to slug in solr documents was not successful")


class EntityType(StrEnum):
"""The different type of entities available from search."""

project = "Project"
user = "User"
group = "Group"


class EntityDoc(BaseModel, ABC, frozen=True):
    """Abstract, immutable base model shared by all entity documents.

    Concrete subclasses add their own fields and must implement
    `entity_type`.
    """

    namespace: Annotated[Slug, BeforeValidator(_str_to_slug)]
    # Read from either "version" or "_version_", always written as "_version_".
    version: DocVersion = Field(
        default=DocVersions.not_exists(),
        serialization_alias="_version_",
        validation_alias=AliasChoices("version", "_version_"),
    )
    score: float | None = None

    @property
    @abstractmethod
    def entity_type(self) -> EntityType:
        """Return the type of this entity."""
        ...

    def to_dict(self) -> dict[str, Any]:
        """Return this entity as a plain dict.

        Fields are dumped under their serialization aliases and fields that
        still hold their default value are omitted. The keys `_type` and
        `_kind` are then added.
        """
        doc = self.model_dump(by_alias=True, exclude_defaults=True)
        doc["_type"] = self.entity_type.value
        # note: _kind=fullentity is for being backwards compatible, it might not be needed in the future
        doc["_kind"] = "fullentity"
        return doc

    def reset_solr_fields(self) -> Self:
        """Return a copy with the solr-populated fields (`version`, `score`) reset."""
        cleared = {"version": DocVersions.not_exists(), "score": None}
        return self.model_copy(update=cleared)


class User(EntityDoc, frozen=True):
    """Document model for a renku user in SOLR."""

    id: str
    firstName: str | None = None
    lastName: str | None = None

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "User":
        """Validate the given dictionary and build a `User` from it."""
        return User.model_validate(d)

    @property
    def entity_type(self) -> EntityType:
        """Return the type of this entity, which is always `EntityType.user`."""
        return EntityType.user

    @field_serializer("namespace", when_used="always")
    def __serialize_namespace(self, value: Slug) -> str:
        # The namespace slug is written as its plain string value.
        return value.value


class Group(EntityDoc, frozen=True):
    """Document model for a renku group in SOLR."""

    id: ULID
    name: str
    description: str | None = None

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Group":
        """Validate the given dictionary and build a `Group` from it."""
        return Group.model_validate(d)

    @property
    def entity_type(self) -> EntityType:
        """Return the type of this entity, which is always `EntityType.group`."""
        return EntityType.group

    @field_serializer("namespace", when_used="always")
    def __serialize_namespace(self, value: Slug) -> str:
        # The namespace slug is written as its plain string value.
        return value.value

    @field_serializer("id", when_used="always")
    def __serialize_id(self, value: ULID) -> str:
        # The ULID is written in its string form.
        return str(value)


class Project(EntityDoc, frozen=True):
    """Represents a renku project in SOLR.

    `creationDate` is normalized to UTC during validation and serialized as
    `YYYY-MM-DDTHH:MM:SSZ`.
    """

    id: ULID
    name: str
    slug: Annotated[Slug, BeforeValidator(_str_to_slug)]
    visibility: Visibility
    createdBy: str
    creationDate: datetime
    repositories: list[str] = Field(default_factory=list)
    description: str | None = None
    keywords: list[str] = Field(default_factory=list)
    # NOTE(review): presumably nested query results describing the namespace
    # and the creator - confirm against the solr client.
    namespaceDetails: ResponseBody | None = None
    creatorDetails: ResponseBody | None = None

    @property
    def entity_type(self) -> EntityType:
        """Return the type of this entity."""
        return EntityType.project

    @field_serializer("namespace", when_used="always")
    def __serialize_namespace(self, namespace: Slug) -> str:
        return namespace.value

    @field_serializer("id", when_used="always")
    def __serialize_id(self, id: ULID) -> str:
        return str(id)

    @field_serializer("slug", when_used="always")
    def __serialize_slug(self, slug: Slug) -> str:
        return slug.value

    @field_serializer("visibility", when_used="always")
    def __serialize_visibility(self, visibility: Visibility) -> str:
        return visibility.value

    @field_serializer("creationDate", when_used="always")
    def __serialize_creation_date(self, creationDate: datetime) -> str:
        # The literal "Z" suffix is correct because validation normalizes the
        # value to UTC (see _add_tzinfo).
        return creationDate.strftime("%Y-%m-%dT%H:%M:%SZ")

    @field_validator("creationDate")
    @classmethod
    def _add_tzinfo(cls, v: datetime) -> datetime:
        """Normalize the creation date to UTC.

        Naive datetimes are taken to already be in UTC and only get the
        timezone attached. Aware datetimes are converted, so the instant they
        denote is preserved; a plain `replace(tzinfo=UTC)` would relabel a
        non-UTC value and shift it by its offset.
        """
        if v.tzinfo is None:
            return v.replace(tzinfo=UTC)
        return v.astimezone(UTC)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Project":
        """Create a Project from a dictionary."""
        return Project.model_validate(d)
124 changes: 124 additions & 0 deletions components/renku_data_services/solr/entity_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""Defines the solr schema used for the renku entities."""

from typing import Final

from renku_data_services.solr.solr_migrate import SchemaMigration
from renku_data_services.solr.solr_schema import (
AddCommand,
Analyzer,
CopyFieldRule,
Field,
FieldName,
FieldType,
Filters,
SchemaCommand,
Tokenizers,
TypeName,
)


class Fields:
    """A collection of fields.

    Each attribute is the `FieldName` of one document field; the string
    passed to `FieldName` is the name as it appears in the document.
    """

    created_by: Final[FieldName] = FieldName("createdBy")
    creation_date: Final[FieldName] = FieldName("creationDate")
    description: Final[FieldName] = FieldName("description")
    entity_type: Final[FieldName] = FieldName("_type")
    kind: Final[FieldName] = FieldName("_kind")
    first_name: Final[FieldName] = FieldName("firstName")
    id: Final[FieldName] = FieldName("id")
    last_name: Final[FieldName] = FieldName("lastName")
    members: Final[FieldName] = FieldName("members")
    name: Final[FieldName] = FieldName("name")
    repositories: Final[FieldName] = FieldName("repositories")
    slug: Final[FieldName] = FieldName("slug")
    visibility: Final[FieldName] = FieldName("visibility")
    keywords: Final[FieldName] = FieldName("keywords")
    namespace: Final[FieldName] = FieldName("namespace")
    # Target of the copy-field rules in the schema below.
    content_all: Final[FieldName] = FieldName("content_all")
    # virtual score field
    score: Final[FieldName] = FieldName("score")


class Analyzers:
    """A collection of analyzers.

    `text_index` and `text_query` share the same tokenizer and filter chain;
    the index analyzer additionally applies an edge n-gram filter as its last
    step.
    """

    # Analyzer applied when indexing text.
    text_index: Final[Analyzer] = Analyzer(
        tokenizer=Tokenizers.uax29UrlEmail,
        filters=[
            Filters.lowercase,
            Filters.stop,
            Filters.english_minimal_stem,
            Filters.ascii_folding,
            # NOTE(review): presumably min gram size 2, max gram size 8 and
            # "preserve original" - confirm against Filters.edgeNgram.
            Filters.edgeNgram(2, 8, True),
        ],
    )

    # Analyzer applied to query text: same chain without the edge n-gram step.
    text_query: Final[Analyzer] = Analyzer(
        tokenizer=Tokenizers.uax29UrlEmail,
        filters=[
            Filters.lowercase,
            Filters.stop,
            Filters.english_minimal_stem,
            Filters.ascii_folding,
        ],
    )


class FieldTypes:
    """A collection of field types.

    Each attribute is a `FieldType` registered under the given `TypeName`
    and referenced by the field definitions of the schema.
    """

    # NOTE(review): make_doc_value() presumably enables docValues - confirm.
    id: Final[FieldType] = FieldType.id(TypeName("SearchId")).make_doc_value()
    string: Final[FieldType] = FieldType.str(TypeName("SearchString")).make_doc_value()
    # Analyzed text, using separate analyzers for indexing and querying.
    text: Final[FieldType] = (
        FieldType.text(TypeName("SearchText"))
        .with_index_analyzer(Analyzers.text_index)
        .with_query_analyzer(Analyzers.text_query)
    )
    # Same analyzers as `text`, but multi-valued.
    text_all: Final[FieldType] = (
        FieldType.text(TypeName("SearchTextAll"))
        .with_index_analyzer(Analyzers.text_index)
        .with_query_analyzer(Analyzers.text_query)
        .make_multi_valued()
    )
    date_time: Final[FieldType] = FieldType.date_time_point(TypeName("SearchDateTime"))


# Ordered list of schema commands defining the initial entity schema.
# Field types are added before the fields that use them, and each copy-field
# rule comes after both its source and destination field have been added.
# NOTE(review): there are no commands for Fields.id, Fields.members or
# Fields.score - presumably provided elsewhere or by solr itself; confirm.
initial_entity_schema: Final[list[SchemaCommand]] = [
    # field types
    AddCommand(FieldTypes.id),
    AddCommand(FieldTypes.string),
    AddCommand(FieldTypes.text),
    AddCommand(FieldTypes.date_time),
    # fields
    AddCommand(Field.of(Fields.entity_type, FieldTypes.string)),
    AddCommand(Field.of(Fields.kind, FieldTypes.string)),
    AddCommand(Field.of(Fields.name, FieldTypes.text)),
    AddCommand(Field.of(Fields.slug, FieldTypes.string)),
    AddCommand(Field.of(Fields.repositories, FieldTypes.string).make_multi_valued()),
    AddCommand(Field.of(Fields.visibility, FieldTypes.string)),
    AddCommand(Field.of(Fields.description, FieldTypes.text)),
    AddCommand(Field.of(Fields.created_by, FieldTypes.id)),
    AddCommand(Field.of(Fields.creation_date, FieldTypes.date_time)),
    # text all
    AddCommand(FieldTypes.text_all),
    AddCommand(Field.of(Fields.content_all, FieldTypes.text_all).make_multi_valued()),
    AddCommand(CopyFieldRule(source=Fields.name, dest=Fields.content_all)),
    AddCommand(CopyFieldRule(source=Fields.description, dest=Fields.content_all)),
    AddCommand(CopyFieldRule(source=Fields.slug, dest=Fields.content_all)),
    AddCommand(CopyFieldRule(source=Fields.repositories, dest=Fields.content_all)),
    # user fields
    AddCommand(Field.of(Fields.first_name, FieldTypes.string)),
    AddCommand(Field.of(Fields.last_name, FieldTypes.string)),
    AddCommand(CopyFieldRule(source=Fields.first_name, dest=Fields.content_all)),
    AddCommand(CopyFieldRule(source=Fields.last_name, dest=Fields.content_all)),
    # keywords
    AddCommand(Field.of(Fields.keywords, FieldTypes.string).make_multi_valued()),
    AddCommand(CopyFieldRule(source=Fields.keywords, dest=Fields.content_all)),
    # namespace
    AddCommand(Field.of(Fields.namespace, FieldTypes.string)),
    AddCommand(CopyFieldRule(source=Fields.namespace, dest=Fields.content_all)),
]


# All schema migrations known to this module; currently only the initial
# entity schema, flagged as requiring a re-index.
# NOTE(review): the version starts at 9 - presumably continuing the numbering
# of an earlier implementation; confirm.
all_migrations: Final[list[SchemaMigration]] = [
    SchemaMigration(version=9, commands=initial_entity_schema, requires_reindex=True)
]
Loading
Loading