import re
from datetime import datetime

from city_scrapers_core.constants import BOARD
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class PittSportsSpider(CityScrapersSpider):
    """Spider for Pittsburgh Sports & Exhibition Authority board meetings.

    Each matching table row on the meetings page is turned into one
    ``Meeting`` item.
    """

    name = "pitt_sports"
    agency = "Pittsburgh Sports & Exhibition Authority"
    timezone = "America/New_York"
    start_urls = ["http://www.pgh-sea.com/index.php?path=info-meet-sea"]

    # Time appended to rows that list a date but no time.
    _DEFAULT_START_TIME = "10:30AM"
    # Layout of a normalized date/time cell, e.g. "Thu, Feb 13, 2020 -- 10:00AM".
    _START_FORMAT = "%a, %b %d, %Y -- %I:%M%p"

    def parse(self, response):
        """Yield a ``Meeting`` for every ``<tr>`` without a ``bgcolor`` attribute.

        Rows that contain no cell text at all are skipped instead of
        aborting the crawl. Rows whose date text cannot be parsed still
        raise ``ValueError`` so that a change of page format is noticed.
        """
        # NOTE(review): presumably header rows carry bgcolor and are thereby
        # excluded by this selector -- confirm against the live page.
        for item in response.css("tr:not([bgcolor])"):
            start = self._parse_start(item)
            if start is None:
                # Nothing to build a meeting from (row without cell text).
                continue
            meeting = Meeting(
                title="SEA Board Meeting",
                description=self._parse_description(item),
                classification=BOARD,
                start=start,
                end=self._parse_end(item),
                all_day=self._parse_all_day(item),
                time_notes=self._parse_time_notes(item),
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(response),
            )
            # _get_status / _get_id are not defined in this class; they are
            # expected to come from the CityScrapersSpider base class.
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting

    def _parse_description(self, item):
        """Return the text of the row's first ``<strong>`` element, or ``""``."""
        return item.css("strong::text").get() or ""

    def _parse_start(self, item):
        """Parse the start of the meeting as a naive ``datetime``.

        The first text node of the row's cells holds the date. When the
        second text node is ``"Rescheduled"`` the third one is used instead.
        If the chosen text has no time part, the default start time is
        appended.

        Returns ``None`` when the row has no cell text. Raises
        ``ValueError`` when the text does not match the expected format.
        """
        # Strip each node but keep empty ones so positions stay unchanged.
        cell_texts = [text.strip() for text in item.css("td::text").getall()]
        if not cell_texts:
            return None

        start_text = cell_texts[0]
        # Guard the length so a truncated row cannot raise IndexError.
        if len(cell_texts) > 2 and cell_texts[1] == "Rescheduled":
            start_text = cell_texts[2]

        if ":" not in start_text:
            start_text += " -- " + self._DEFAULT_START_TIME

        # Collapse "10:30 am" / "10:30 AM" / "10:30am" into "10:30AM" because
        # the format string allows no whitespace between minutes and AM/PM.
        start_text = re.sub(
            r"\s*([ap]m)$",
            lambda match: match.group(1).upper(),
            start_text,
            flags=re.IGNORECASE,
        )
        return datetime.strptime(start_text, self._START_FORMAT)

    def _parse_end(self, item):
        """Return ``None``; no end time is read from the row."""
        return None

    def _parse_time_notes(self, item):
        """Return an empty string; no timing notes are read from the row."""
        return ""

    def _parse_all_day(self, item):
        """Return ``False``; meetings are never marked as all-day."""
        return False

    def _parse_location(self, item):
        """Build the location dict from the last text node of the row's cells.

        The convention center address is filled in when the name contains
        ``"DLCC"``; otherwise the address is left empty.
        """
        cell_texts = item.css("td::text").getall()
        # An empty row yields an empty name instead of raising IndexError.
        meeting_location = cell_texts[-1].strip() if cell_texts else ""
        if "DLCC" in meeting_location:
            return {
                "address": "1000 Fort Duquesne Blvd, Pittsburgh, PA 15222",
                "name": meeting_location,
            }
        return {
            "address": "",
            "name": meeting_location,
        }

    def _parse_links(self, item):
        """Return the row's first link as an agenda link.

        Only hrefs containing ``"http"`` are accepted. When there is no
        such link a single placeholder entry with empty values is returned.
        """
        agenda_link = item.css("a::attr(href)").get()
        if agenda_link and "http" in agenda_link:
            return [{"href": agenda_link, "title": "agenda"}]
        # NOTE(review): an empty list may be the more conventional "no links"
        # value; the placeholder is kept because existing tests assert it.
        return [{"href": "", "title": ""}]

    def _parse_source(self, response):
        """Return the URL of the scraped page."""
        return response.url
+ + + +
+ + + + +
+ + + + +
+ + + + + + + +
+
+
+
    +
  • +
    + North Shore +
     
    +
    + +
    +
    +
  • +
  • +
    + Stadium +
     
    +
    + +
    +
    +
  • +
  • +
    + SEA +
     
    +
    + +
    +
    +
  • +
+
+
+
+ + + + + +
+
+ +
+
+ +
+
+
+ +
+ + + + +
+ + +
+ +
+
+

SEA Board Meetings

+
+
+ +
+ SEA Board Meetings are typically held on the second Thursday of each month at the David L. Lawrence Convention Center at 10:30am. Meeting changes will be advertised in a newspaper of general circulation at least 24 hours prior to the scheduled meeting. +

+ Public participation at open meetings is a statutory right accorded to residents and taxpayers of Allegheny County. To assure that persons who wish to appear before the Board may be heard, the SEA Board will encourage public participation according to the terms of this policy (click to view policy). +

+ For SEA Board Meeting minutes or questions, please call 412-393-0200. +

+
+ +
+ 2011 .. + 2012 .. + 2013 .. + 2014 .. + 2015 .. + 2016 .. + 2017 .. + 2018 .. + 2019 .. + 2020 +

+
+ +
+ + + + + + + + + +
DATEAGENDAMINUTES / OTHER
Thu, Jan 09, 2020
Cancelled
DLCC222


Thu, Feb 13, 2020 -- 10:00am
DLCC
Agenda
Minutes
Thu, Mar 12, 2020
Cancelled
DLCC


Thu, Apr 09, 2020 -- 10:30 am
Remote
Agenda
Minutes
Thu, May 14, 2020 -- 10:30 am
Remote
Agenda
Minutes
Thu, Jun 11, 2020
Remote
Agenda
Minutes
Thu, Jul 09, 2020
Remote
Agenda
Minutes
Thu, Aug 13, 2020
Remote
Agenda
Minutes
Thu, Sep 10, 2020
Cancelled
Remote


Thu, Oct 08, 2020
Remote
Agenda
Minutes
Thu, Nov 12, 2020
Remote
Agenda
Statement
Thu, Dec 10, 2020
Rescheduled
Thu, Dec 17, 2020 -- 11:00am
Remote

December 17, 2020 SEA Board Statement
+ +
+
+
+
+ + + +
+
+
+
+ Home +    |    + Privacy Policy +    |    + Contact Us +
+
+ © Pittsburgh SEA - 2019 +
+
+
+
+ + +
+ + +
"""Tests for the Pittsburgh Sports & Exhibition Authority spider.

The spider is run once against a saved copy of the meetings page; the
individual tests then inspect the parsed items.
"""
from datetime import datetime
from os.path import dirname, join

import pytest
from city_scrapers_core.constants import BOARD
from city_scrapers_core.utils import file_response
from freezegun import freeze_time

from city_scrapers.spiders.pitt_sports import PittSportsSpider

test_response = file_response(
    join(dirname(__file__), "files", "pitt_sports.html"),
    url="http://www.pgh-sea.com/index.php?path=info-meet-sea",
)
spider = PittSportsSpider()

# Parse with the clock frozen. The context manager restores real time even
# if parsing raises, so a failure here cannot leak frozen time into other
# test modules collected in the same session.
with freeze_time("2020-12-11"):
    parsed_items = list(spider.parse(test_response))


def test_title():
    assert parsed_items[0]["title"] == "SEA Board Meeting"


def test_description():
    assert parsed_items[0]["description"] == "Cancelled"


def test_start():
    # The first row lists no time, so the spider's default of 10:30 applies.
    assert parsed_items[0]["start"] == datetime(2020, 1, 9, 10, 30)


def test_status():
    assert parsed_items[0]["status"] == "cancelled"


def test_location():
    assert parsed_items[0]["location"] == {
        "name": "DLCC222",
        "address": "1000 Fort Duquesne Blvd, Pittsburgh, PA 15222",
    }


def test_source():
    assert (
        parsed_items[0]["source"]
        == "http://www.pgh-sea.com/index.php?path=info-meet-sea"
    )


def test_links():
    # The first row has no agenda link; the spider emits a placeholder entry.
    assert parsed_items[0]["links"] == [{"href": "", "title": ""}]


def test_classification():
    assert parsed_items[0]["classification"] == BOARD


@pytest.mark.parametrize("item", parsed_items)
def test_all_day(item):
    assert item["all_day"] is False