Skip to content

Commit 65efbdc

Browse files
authored
Merge pull request #4 from cbaezp/3-date-parsing-error
add additional date parsing formats
2 parents 253c109 + dd8560e commit 65efbdc

File tree

2 files changed

+41
-3
lines changed

2 files changed

+41
-3
lines changed

app/utils/file_parser.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,19 @@ def parse_whatsapp_chat(file):
1515
message = match.group(3)
1616
# Check if the message is not a placeholder for media
1717
if message not in ["audio omitted", "image omitted", "video omitted"]:
18-
# Normalize the timestamp by replacing "a.m." and "p.m." with "AM" and "PM"
18+
1919
timestamp = timestamp.replace(' a.m.', ' AM').replace(' p.m.', ' PM').replace('a.m.', ' AM').replace('p.m.', ' PM')
2020

2121
# List of possible date formats to handle various cases
2222
date_formats = [
23-
'%m/%d/%y, %I:%M:%S %p',
24-
'%d/%m/%y, %I:%M:%S %p'
23+
'%m/%d/%y, %I:%M:%S %p', # 08/12/24, 8:57:27 PM
24+
'%d/%m/%y, %I:%M:%S %p', # 12/08/24, 8:57:27 PM
25+
'%d/%m/%Y %H:%M:%S', # 23/05/2024 21:44:49 (24-hour format)
26+
'%m/%d/%y, %I:%M:%S %p', # 08/12/24, 08:57:27 AM (same format string as the first entry; %I already accepts zero-padded hours)
27+
'%d/%m/%y, %I:%M:%S %p', # 12/08/24, 08:57:27 AM (same format string as the second entry; %I already accepts zero-padded hours)
28+
'%d/%m/%y, %H:%M:%S', # 23/05/24, 21:44:49 (24-hour format)
29+
'%d/%m/%Y, %H:%M:%S', # 23/05/2024, 21:44:49 (24-hour format with full year)
30+
'%m/%d/%Y, %I:%M:%S %p' # 08/12/2024, 8:57:27 PM
2531
]
2632

2733
for date_format in date_formats:

tests/test_parse_whatsapp_chat.py

+32
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,37 @@ def test_invalid_date_format(self):
3737
with self.assertRaises(ValueError):
3838
parse_whatsapp_chat(BytesIO(chat_data))
3939

40+
def test_24_hour_format(self):
41+
chat_data = b"[23/05/2024 21:44:49] Alice: Good evening!\n[23/05/2024 08:30:15] Bob: Good morning!"
42+
df = parse_whatsapp_chat(BytesIO(chat_data))
43+
self.assertEqual(len(df), 2)
44+
self.assertEqual(df.iloc[0]['Sender'], 'Alice')
45+
self.assertEqual(df.iloc[1]['Sender'], 'Bob')
46+
47+
# Not expected in real exports, but covered just in case: two timestamp formats mixed in one chat
48+
def test_mixed_formats(self):
49+
chat_data = b"[12/01/23, 03:45:12 p.m.] John: Hello!\n[23/05/2024 21:44:49] Jane: How's it going?"
50+
df = parse_whatsapp_chat(BytesIO(chat_data))
51+
self.assertEqual(len(df), 2)
52+
self.assertEqual(df.iloc[0]['Sender'], 'John')
53+
self.assertEqual(df.iloc[1]['Sender'], 'Jane')
54+
55+
def test_omitted_media_multiple_entries(self):
56+
chat_data = b"[12/01/23, 03:45:12 p.m.] John: audio omitted\n[12/01/23, 03:46:12 p.m.] Jane: image omitted\n[12/01/23, 03:47:12 p.m.] John: How are you?"
57+
df = parse_whatsapp_chat(BytesIO(chat_data))
58+
self.assertEqual(len(df), 1) # Two messages should be skipped
59+
self.assertEqual(df.iloc[0]['Sender'], 'John')
60+
self.assertEqual(df.iloc[0]['Message'], 'How are you?')
61+
62+
def test_case_with_empty_file(self):
63+
chat_data = b""
64+
df = parse_whatsapp_chat(BytesIO(chat_data))
65+
self.assertEqual(len(df), 0) # DataFrame should be empty
66+
67+
def test_case_with_only_media_omitted(self):
68+
chat_data = b"[12/01/23, 03:45:12 p.m.] John: audio omitted\n[12/01/23, 03:46:12 p.m.] Jane: video omitted"
69+
df = parse_whatsapp_chat(BytesIO(chat_data))
70+
self.assertEqual(len(df), 0) # DataFrame should be empty because both messages are omitted-media placeholders
71+
4072
if __name__ == '__main__':
4173
unittest.main()

0 commit comments

Comments
 (0)