From feabf56678ccc947014adc03d91b506a322a477f Mon Sep 17 00:00:00 2001 From: fmk17 <fmk17@inf.ufpr.br> Date: Tue, 25 Apr 2023 14:41:49 -0300 Subject: [PATCH] Update crawler --- crawler.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/crawler.py b/crawler.py index 6af929f..536a696 100644 --- a/crawler.py +++ b/crawler.py @@ -17,7 +17,7 @@ LocationDays = namedtuple("LocationDays", "days, location, update_datetime") cached_update_times: Dict[Location, datetime] = dict() cached_responses: Dict[Location, LocationDays] = dict() -DATE_REGEX = r": (\d{1,2})\/(\d{1,2})\/(\d{4})" +DATE_REGEX = r"(\d{1,2})\/(\d{1,2})\/(\d{2,4})" logger = logging.getLogger("crawler") @@ -34,11 +34,15 @@ def get_location_days( ): return cached_responses[location] - response = requests.get(location.url) + response = requests.get(location.url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0'}) soup = BeautifulSoup(response.text, "lxml") post = soup.select_one("#post div:nth-child(3)") - post_children = iter(post.children) + grouper = post.select_one('.wp-block-group__inner-container') + if grouper is not None: + post_children = iter(grouper.children) + else: + post_children = iter(post.children) post_children = (node for node in post_children if node.text.strip() != "") post_children = dropwhile( lambda n: not re.search(DATE_REGEX, n.text), post_children @@ -52,6 +56,8 @@ def get_location_days( if date_re is None: break d, m, y = map(int, date_re.groups()) + if y < 100: + y += 2000 table_children = iter(table_node.select("td")) menus = [] for title_node, item_nodes in zip(table_children, table_children): @@ -83,14 +89,19 @@ def get_location_days( if indicator is not None: item_indicators.add(indicator) elif child.text.strip() != "": - item_name = child.text.strip() + if item_name is not None: + item_name += " " + child.text.strip() + else: + item_name = child.text.strip() if item_name is not None: items.append( MenuItem( name=item_name, indicators=frozenset(item_indicators) ) ) - menus.append(Menu(meal_name=title_node.text, items=tuple(items))) + menus.append( + Menu(meal_name=title_node.text.strip(), items=tuple(items)) + ) days.append(Day(date=date(y, m, d), date_raw=date_text, menus=menus)) cached_update_times[location] = datetime.now() -- GitLab