Skip to content

Commit

Permalink
add Feed > Post endpoints again
Browse files Browse the repository at this point in the history
  • Loading branch information
fqrious committed Jan 7, 2025
1 parent ab6f45a commit d6ceb0b
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 54 deletions.
113 changes: 60 additions & 53 deletions obstracts/server/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,31 +184,6 @@ def get_markdown(cls, request, md_text, images_qs: 'models.models.BaseManager[mo
"""
),
),
create_posts=extend_schema(
request=serializers.PostCreateSerializer,
responses={201:JobSerializer, 404: api_schema.DEFAULT_404_ERROR, 400: api_schema.DEFAULT_400_ERROR},
summary="Backfill a Post into A Feed",
description=textwrap.dedent(
"""
This endpoint allows you to add Posts manually to a Feed. This endpoint is designed to ingest posts that are not identified by the Wayback Machine (used by the POST Feed endpoint during ingestion). If the feed you want to add a post to does not already exist, you should first add it using the POST Feed endpoint.
The following key/values are accepted in the body of the request:
* `profile_id` (required): a valid profile ID to define how the post should be processed.
* `link` (required - must be unique): The URL of the blog post. This is where the content of the post is found. It cannot be the same as the `url` of a post already in this feed. If you want to update the post, use the PATCH post endpoint.
* `pubdate` (required): The date of the blog post in the format `YYYY-MM-DD`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually.
* `title` (required): history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually.
* `author` (optional): the value to be stored for the author of the post.
* `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`.
Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` (history4feed) and the value used `<FEED_ID>+<POST_URL>+<POST_PUB_TIME (to .000000Z)>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`).
The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
_Note: We do have a proof-of-concept to scrape a site for all blog post urls, titles, and pubdate called [sitemap2posts](https://github.com/muchdogesec/sitemap2posts) which can help form the request body needed for this endpoint._
"""
),
),
)
class FeedView(viewsets.ViewSet):
lookup_url_kwarg = "feed_id"
Expand Down Expand Up @@ -362,22 +337,6 @@ def fetch(self, request, *args, **kwargs):
job = tasks.new_task(out, s.validated_data['profile_id'])
return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED)
return resp

@decorators.action(detail=True, methods=["POST"], url_path='posts')
def create_posts(self, request, *args, **kwargs):
request_body = request.body
s = serializers.FetchFeedSerializer(data=request.data)
s.is_valid(raise_exception=True)

resp = FeedView.make_request(
request, f"/api/v1/feeds/{kwargs.get(FeedView.lookup_url_kwarg)}/posts/", request_body=request_body
)
if resp.status_code == 201:
out = json.loads(resp.content)
out['job_id'] = out['id']
job = tasks.new_post_patch_task(out, s.validated_data["profile_id"])
return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED)
return resp



Expand Down Expand Up @@ -473,6 +432,7 @@ class PostOnlyView(viewsets.ViewSet):
ordering_fields = ["pubdate", "title"]
ordering = ["-pubdate"]
minmax_date_fields = ["pubdate"]
h4f_base_path = "/api/v1/posts"

class filterset_class(FilterSet):
feed_id = filters.BaseInFilter(help_text="filter by one or more `feed_id`(s)")
Expand All @@ -491,20 +451,20 @@ class filterset_class(FilterSet):
job_id = Filter(help_text="Filter the Post by Job ID the Post was downloaded in.")


def list(self, request, *args, feed_id=None, **kwargs):
url = f"/api/v1/posts/"
def list(self, request, *args, **kwargs):
url = self.h4f_base_path + "/"
return self.add_obstract_props(FeedView.make_request(
request, url
))

def retrieve(self, request, *args, feed_id=None, post_id=None):
url = f"/api/v1/posts/{post_id}/"
def retrieve(self, request, *args, post_id=None, **kwargs):
url = f"{self.h4f_base_path}/{post_id}/"
return self.add_obstract_props(FeedView.make_request(
request, url
))

def partial_update(self, request, *args, feed_id=None, post_id=None):
url = f"/api/v1/posts/{post_id}/"
def partial_update(self, request, *args, post_id=None, **kwargs):
url = f"{self.h4f_base_path}/{post_id}/"

return self.add_obstract_props(FeedView.make_request(
request, url
Expand All @@ -528,9 +488,9 @@ def get_providers(ids):
d.update(id_provider_map.get(d['id'], {}))
return Response(data, status=response.status_code)

def destroy(self, request, *args, post_id=None):
def destroy(self, request, *args, post_id=None, **kwargs):
resp = FeedView.make_request(
request, f"/api/v1/posts/{post_id}/"
request, f"{self.h4f_base_path}/{post_id}/"
)
if resp.status_code != 204:
return resp
Expand All @@ -547,7 +507,7 @@ def fetch(self, request, *args, **kwargs):
s.is_valid(raise_exception=True)
post_id = kwargs.get(self.lookup_url_kwarg)
resp = FeedView.make_request(
request, f"/api/v1/posts/{post_id}/", request_body=request_body
request, f"{self.h4f_base_path}/{post_id}/", request_body=request_body
)
if resp.status_code == 201:
self.remove_report(post_id)
Expand Down Expand Up @@ -577,7 +537,7 @@ def fetch(self, request, *args, **kwargs):
),
)
@decorators.action(detail=True, methods=["GET"])
def objects(self, request, feed_id=None, post_id=None):
def objects(self, request, post_id=None, **kwargs):
return self.get_post_objects(post_id)

def get_post_objects(self, post_id):
Expand Down Expand Up @@ -652,7 +612,7 @@ def get_post_objects(self, post_id):
],
)
@decorators.action(detail=True, methods=["GET"])
def markdown(self, request, post_id=None):
def markdown(self, request, post_id=None, **kwargs):
obj = get_object_or_404(models.File, post_id=post_id)
resp_text = MarkdownImageReplacer.get_markdown(request, obj.markdown_file.read().decode(), models.FileImage.objects.filter(report__post_id=post_id))
return FileResponse(streaming_content=resp_text, content_type='text/markdown', filename='markdown.md')
Expand All @@ -668,7 +628,7 @@ def markdown(self, request, post_id=None):
),
)
@decorators.action(detail=True, pagination_class=Pagination("images"))
def images(self, request, post_id=None, image=None):
def images(self, request, post_id=None, image=None, **kwargs):
queryset = models.FileImage.objects.filter(report__post_id=post_id).order_by('name')
paginator = Pagination('images')

Expand Down Expand Up @@ -698,6 +658,53 @@ def remove_report(self, post_id):
except Exception as e:
logging.exception("remove_report failed")

@extend_schema_view(
    # The keyword must match the name of a method on the view (`create`);
    # drf-spectacular ignores overrides whose name is not found on the view.
    create=extend_schema(
        request=serializers.PostCreateSerializer,
        responses={201: JobSerializer, 404: api_schema.DEFAULT_404_ERROR, 400: api_schema.DEFAULT_400_ERROR},
        summary="Backfill a Post into A Feed",
        description=textwrap.dedent(
            """
            This endpoint allows you to add Posts manually to a Feed. This endpoint is designed to ingest posts that are not identified by the Wayback Machine (used by the POST Feed endpoint during ingestion). If the feed you want to add a post to does not already exist, you should first add it using the POST Feed endpoint.
            The following key/values are accepted in the body of the request:
            * `profile_id` (required): a valid profile ID to define how the post should be processed.
            * `link` (required - must be unique): The URL of the blog post. This is where the content of the post is found. It cannot be the same as the `url` of a post already in this feed. If you want to update the post, use the PATCH post endpoint.
            * `pubdate` (required): The date of the blog post in the format `YYYY-MM-DD`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually.
            * `title` (required): history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually.
            * `author` (optional): the value to be stored for the author of the post.
            * `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`.
            Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` (history4feed) and the value used `<FEED_ID>+<POST_URL>+<POST_PUB_TIME (to .000000Z)>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`).
            The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
            _Note: We do have a proof-of-concept to scrape a site for all blog post urls, titles, and pubdate called [sitemap2posts](https://github.com/muchdogesec/sitemap2posts) which can help form the request body needed for this endpoint._
            """
        ),
    ),
)
class FeedPostView(PostOnlyView):
    """Post endpoints nested under a single feed (`feeds/<feed_id>/posts`).

    Reuses the `PostOnlyView` handlers, but points them at the feed-scoped
    upstream path by overriding `h4f_base_path`, and adds `create` for
    backfilling a post into the feed.
    """

    openapi_tags = ["Feeds"]

    @property
    def h4f_base_path(self):
        """Return the upstream posts path for the feed named in the URL kwargs."""
        return f"/api/v1/feeds/{self.kwargs['feed_id']}/posts"

    def create(self, request, *args, **kwargs):
        """Forward a post-creation request upstream and start a processing job.

        The request data is validated with `FetchFeedSerializer`, then the raw
        body is forwarded to the feed's posts path via `FeedView.make_request`.

        Returns:
            A 201 response holding the serialized job when the upstream call
            answers 201; otherwise the upstream response, unchanged.
        """
        # Capture the raw body before `request.data` is accessed, as the
        # original code does (presumably because the stream cannot be re-read
        # after parsing -- TODO confirm).
        request_body = request.body
        # NOTE(review): the documented request schema is PostCreateSerializer,
        # but validation here uses FetchFeedSerializer (the source of
        # `profile_id` below) -- confirm the remaining fields are validated
        # upstream.
        s = serializers.FetchFeedSerializer(data=request.data)
        s.is_valid(raise_exception=True)

        # Use the shared property so the path is defined in exactly one place.
        resp = FeedView.make_request(
            request, f"{self.h4f_base_path}/", request_body=request_body
        )
        if resp.status_code != 201:
            return resp

        out = json.loads(resp.content)
        # Copy the upstream `id` into `job_id` before creating the task.
        out['job_id'] = out['id']
        job = tasks.new_post_patch_task(out, s.validated_data["profile_id"])
        return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED)

@extend_schema_view(
list=extend_schema(
Expand Down
1 change: 1 addition & 0 deletions obstracts/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def handler500(*args, **kwargs):

router = routers.SimpleRouter(use_regex_path=False)
router.register('profiles', ProfileView, 'profile-view')
router.register("feeds/<uuid:feed_id>/posts", views.FeedPostView, "feed-post-view")
router.register('feeds', views.FeedView, "feed-view")

router.register("posts", views.PostOnlyView, "post-view")
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dataclasses-json==0.6.7; python_version >= '3.7' and python_version < '4.0'
deprecated==1.2.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
dirtyjson==1.0.8
distro==1.9.0; python_version >= '3.6'
django==5.0.9; python_version >= '3.10'
django==5.0.10; python_version >= '3.10'
django-cors-headers==4.4.0;
django-filter==24.2; python_version >= '3.8'
django-restframework==0.0.1
Expand Down

0 comments on commit d6ceb0b

Please sign in to comment.