Skip to content

Commit ee275c8

Browse files
committed
fix date parse format error
1 parent b112946 commit ee275c8

File tree

2 files changed

+61
-6
lines changed

2 files changed

+61
-6
lines changed

app/utils/file_parser.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import pandas as pd
33
from datetime import datetime
44

5-
65
def parse_whatsapp_chat(file):
76
pattern = r'(\[.*?\]) (.*?): (.*)'
87
lines = file.read().decode("utf-8").splitlines()
@@ -14,12 +13,27 @@ def parse_whatsapp_chat(file):
1413
timestamp = match.group(1).strip('[]')
1514
sender = match.group(2)
1615
message = match.group(3)
17-
#Noticed that whatsapp uses the list below as placeholders for media.
16+
# Check if the message is not a placeholder for media
1817
if message not in ["audio omitted", "image omitted", "video omitted"]:
19-
try:
20-
date_time_obj = datetime.strptime(timestamp, '%m/%d/%y, %I:%M:%S %p')
21-
except ValueError:
22-
date_time_obj = datetime.strptime(timestamp, '%d/%m/%y, %I:%M:%S %p')
18+
# Normalize the timestamp by replacing "a.m." and "p.m." with "AM" and "PM"
19+
timestamp = timestamp.replace(' a.m.', ' AM').replace(' p.m.', ' PM').replace('a.m.', ' AM').replace('p.m.', ' PM')
20+
21+
# List of possible date formats to handle various cases
22+
date_formats = [
23+
'%m/%d/%y, %I:%M:%S %p',
24+
'%d/%m/%y, %I:%M:%S %p'
25+
]
26+
27+
for date_format in date_formats:
28+
try:
29+
date_time_obj = datetime.strptime(timestamp, date_format)
30+
break
31+
except ValueError:
32+
continue
33+
else:
34+
# If no format matched, raise an error or handle it accordingly
35+
raise ValueError(f"Timestamp format not recognized: {timestamp}")
36+
2337
data.append([date_time_obj, sender, message])
2438

2539
df = pd.DataFrame(data, columns=['Timestamp', 'Sender', 'Message'])

tests/test_parse_whatsapp_chat.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import unittest
2+
import pandas as pd
3+
from io import BytesIO
4+
from app.utils.file_parser import parse_whatsapp_chat
5+
6+
class TestParseWhatsappChat(unittest.TestCase):
    """Exercise parse_whatsapp_chat on small in-memory chat exports.

    Every fixture uses the "a.m."/"p.m." spelling so the parser's
    normalization to "AM"/"PM" is covered by each test.
    """

    def test_parse_standard_format(self):
        """A date valid as month/day/year yields one row per message."""
        chat_data = (
            b"[12/01/23, 03:45:12 p.m.] John: Hello!\n"
            b"[12/01/23, 03:46:12 p.m.] Jane: Hi, how are you?"
        )
        df = parse_whatsapp_chat(BytesIO(chat_data))
        self.assertEqual(len(df), 2)
        self.assertEqual(df.iloc[0]['Sender'], 'John')
        self.assertEqual(df.iloc[1]['Sender'], 'Jane')

    def test_parse_non_standard_format(self):
        """Swapping the first two date fields still parses both messages."""
        chat_data = (
            b"[01/12/23, 03:45:12 p.m.] John: Hello!\n"
            b"[01/12/23, 03:46:12 p.m.] Jane: Hi, how are you?"
        )
        df = parse_whatsapp_chat(BytesIO(chat_data))
        self.assertEqual(len(df), 2)
        self.assertEqual(df.iloc[0]['Sender'], 'John')
        self.assertEqual(df.iloc[1]['Sender'], 'Jane')

    def test_parse_day_month_format(self):
        """A day greater than 12 must be handled by the day/month fallback."""
        # 25 is not a valid month, so the month-first format is rejected and
        # only the day-first format can accept this timestamp.
        chat_data = (
            b"[25/12/23, 03:45:12 p.m.] John: Hello!\n"
            b"[25/12/23, 03:46:12 p.m.] Jane: Hi, how are you?"
        )
        df = parse_whatsapp_chat(BytesIO(chat_data))
        self.assertEqual(len(df), 2)
        self.assertEqual(df.iloc[0]['Sender'], 'John')
        self.assertEqual(df.iloc[1]['Sender'], 'Jane')
        # Pin the parsed value: 25 December, and "p.m." mapped to 15:00.
        first = pd.Timestamp(df.iloc[0]['Timestamp'])
        self.assertEqual((first.month, first.day, first.hour), (12, 25, 15))

    def test_parse_with_media_omitted(self):
        """Media placeholder messages are dropped from the result."""
        chat_data = (
            b"[12/01/23, 03:45:12 p.m.] John: audio omitted\n"
            b"[12/01/23, 03:46:12 p.m.] Jane: Hi, how are you?"
        )
        df = parse_whatsapp_chat(BytesIO(chat_data))
        self.assertEqual(len(df), 1)  # One message should be skipped
        self.assertEqual(df.iloc[0]['Sender'], 'Jane')

    def test_invalid_date_format(self):
        """A timestamp matching no known format raises ValueError."""
        chat_data = b"[99/99/99, 03:45:12 p.m.] John: Hello!"
        with self.assertRaises(ValueError):
            parse_whatsapp_chat(BytesIO(chat_data))
39+
40+
if __name__ == '__main__':
41+
unittest.main()

0 commit comments

Comments
 (0)