Skip to content

Commit 209eaff

Browse files
committed
Add coerce error to handle forwarded messages with a different timestamp format
1 parent 65efbdc commit 209eaff

File tree

2 files changed

+23
-10
lines changed

2 files changed

+23
-10
lines changed

app/utils/file_parser.py

+21-8
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,14 @@ def parse_whatsapp_chat(file):
1313
timestamp = match.group(1).strip('[]')
1414
sender = match.group(2)
1515
message = match.group(3)
16-
# Check if the message is not a placeholder for media
16+
17+
# Check if the message is not a placeholder for media; might need additional strings for stickers, calls, etc.
1718
if message not in ["audio omitted", "image omitted", "video omitted"]:
18-
19-
timestamp = timestamp.replace(' a.m.', ' AM').replace(' p.m.', ' PM').replace('a.m.', ' AM').replace('p.m.', ' PM')
19+
20+
# Normalize different AM/PM representations
21+
timestamp = timestamp.replace(' a.m.', ' AM').replace(' p.m.', ' PM')
22+
timestamp = timestamp.replace('a. m.', ' AM').replace('p. m.', ' PM')
23+
timestamp = timestamp.replace('a.m.', ' AM').replace('p.m.', ' PM')
2024

2125
# List of possible date formats to handle various cases
2226
date_formats = [
@@ -27,18 +31,27 @@ def parse_whatsapp_chat(file):
2731
'%d/%m/%y, %I:%M:%S %p', # 12/08/24, 08:57:27 AM
2832
'%d/%m/%y, %H:%M:%S', # 23/05/24, 21:44:49 (24-hour format)
2933
'%d/%m/%Y, %H:%M:%S', # 23/05/2024, 21:44:49 (24-hour format with full year)
30-
'%m/%d/%Y, %I:%M:%S %p' # 08/12/2024, 8:57:27 PM
34+
'%m/%d/%Y, %I:%M:%S %p', # 08/12/2024, 8:57:27 PM
35+
'%H:%M, %d/%m/%Y', # 10:03, 12/3/2024
36+
'%H:%M, %m/%d/%Y', # 10:03, 3/12/2024 (US format)
37+
'%H:%M, %d/%m/%y', # 10:03, 12/3/24 (short year format)
38+
'%H:%M, %m/%d/%y', # 10:03, 3/12/24 (short year, US format)
39+
'%I:%M %p, %d/%m/%Y', # 0:28 PM, 22/8/2022 (Handling AM/PM format with day/month/year)
40+
'%I:%M %p, %m/%d/%Y', # 0:28 PM, 8/22/2022 (US format)
41+
'%I:%M %p, %d/%m/%y', # 0:28 PM, 22/8/22 (short year format)
42+
'%I:%M %p, %m/%d/%y', # 0:28 PM, 8/22/22 (short year, US format)
43+
'%I:%M %p, %d/%m/%Y', # 0:28 p.m., 22/8/2022
3144
]
3245

3346
for date_format in date_formats:
3447
try:
3548
date_time_obj = datetime.strptime(timestamp, date_format)
3649
break
3750
except ValueError:
38-
continue
39-
else:
40-
# If no format matched, raise an error or handle it accordingly
41-
raise ValueError(f"Timestamp format not recognized: {timestamp}")
51+
date_time_obj = None
52+
53+
if date_time_obj is None:
54+
continue
4255

4356
data.append([date_time_obj, sender, message])
4457

tests/test_parse_whatsapp_chat.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ def test_parse_with_media_omitted(self):
3434

3535
def test_invalid_date_format(self):
3636
chat_data = b"[99/99/99, 03:45:12 p.m.] John: Hello!"
37-
with self.assertRaises(ValueError):
38-
parse_whatsapp_chat(BytesIO(chat_data))
37+
df = parse_whatsapp_chat(BytesIO(chat_data))
38+
self.assertEqual(len(df), 0) # Should return an empty DataFrame
3939

4040
def test_24_hour_format(self):
4141
chat_data = b"[23/05/2024 21:44:49] Alice: Good evening!\n[23/05/2024 08:30:15] Bob: Good morning!"

0 commit comments

Comments
 (0)