From feabf56678ccc947014adc03d91b506a322a477f Mon Sep 17 00:00:00 2001
From: fmk17 <fmk17@inf.ufpr.br>
Date: Tue, 25 Apr 2023 14:41:49 -0300
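Subject: [PATCH] Update crawler: make menu page parsing more tolerant

- DATE_REGEX no longer requires a leading ": " and accepts years written
  with 2 to 4 digits; parsed years below 100 have 2000 added.
- Send a browser User-Agent header with the page request.
- Iterate the children of `.wp-block-group__inner-container` when that
  element exists inside the post, otherwise the post's own children.
- Join the text of multiple non-empty child nodes of an item with a
  space instead of keeping only the last one.
- Strip surrounding whitespace from the meal title.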
---
 crawler.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/crawler.py b/crawler.py
index 6af929f..536a696 100644
--- a/crawler.py
+++ b/crawler.py
@@ -17,7 +17,7 @@ LocationDays = namedtuple("LocationDays", "days, location, update_datetime")
 cached_update_times: Dict[Location, datetime] = dict()
 cached_responses: Dict[Location, LocationDays] = dict()
 
-DATE_REGEX = r": (\d{1,2})\/(\d{1,2})\/(\d{4})"
+DATE_REGEX = r"(\d{1,2})\/(\d{1,2})\/(\d{2,4})"
 logger = logging.getLogger("crawler")
 
 
@@ -34,11 +34,15 @@ def get_location_days(
     ):
         return cached_responses[location]
 
-    response = requests.get(location.url)
+    response = requests.get(location.url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0'})
     soup = BeautifulSoup(response.text, "lxml")
 
     post = soup.select_one("#post div:nth-child(3)")
-    post_children = iter(post.children)
+    grouper = post.select_one('.wp-block-group__inner-container')
+    if grouper is not None:
+        post_children = iter(grouper.children)
+    else:
+        post_children = iter(post.children)
     post_children = (node for node in post_children if node.text.strip() != "")
     post_children = dropwhile(
         lambda n: not re.search(DATE_REGEX, n.text), post_children
@@ -52,6 +56,8 @@ def get_location_days(
         if date_re is None:
             break
         d, m, y = map(int, date_re.groups())
+        if y < 100:
+            y += 2000
         table_children = iter(table_node.select("td"))
         menus = []
         for title_node, item_nodes in zip(table_children, table_children):
@@ -83,14 +89,19 @@ def get_location_days(
                     if indicator is not None:
                         item_indicators.add(indicator)
                 elif child.text.strip() != "":
-                    item_name = child.text.strip()
+                    if item_name is not None:
+                        item_name += " " + child.text.strip()
+                    else:
+                        item_name = child.text.strip()
             if item_name is not None:
                 items.append(
                     MenuItem(
                         name=item_name, indicators=frozenset(item_indicators)
                     )
                 )
-            menus.append(Menu(meal_name=title_node.text, items=tuple(items)))
+            menus.append(
+                Menu(meal_name=title_node.text.strip(), items=tuple(items))
+            )
         days.append(Day(date=date(y, m, d), date_raw=date_text, menus=menus))
 
     cached_update_times[location] = datetime.now()
-- 
GitLab