Skip to content

Commit

Permalink
Merge pull request #47 from szegedai/duration_parser
Browse files Browse the repository at this point in the history
Duration parser
  • Loading branch information
nsomabalint authored Jul 15, 2024
2 parents d77b5d3 + 2ceb8d5 commit 323673a
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 2 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,30 @@ text2datetime('péntek', now=datetime(2023, 6, 7), search_scope=SearchScopes.NOT
# 'end_date': datetime.datetime(2023, 6, 9, 23, 59, 59)}]
```

### Duration Parsing

The duration parser can extract the duration in minutes from various expressions found in sentences.

#### Recognized Formats

The parser is capable of understanding a variety of duration expressions. Here are the primary formats it recognizes:

- **Hour and minute combination**
  - Examples: `1 óra 45 perc`, `egy óra 30 perc`, `2 óra 15 perc`
- **Hour only** (whole hours or half hours written with `,5`)
  - Examples: `1 óra`, `egy óra`, `2 órát`, `3,5 óra`
- **Quarter- and half-hour phrases**
  - Examples: `háromnegyed óra`, `egy és negyed óra`, `kettő és fél óra`


```python
from hun_date_parser import parse_duration

print(parse_duration('45 perc')) # Output: 45

print(parse_duration('1 és negyed óra')) # Output: 75
```

### Datetime to text

The library is also capable of turning datetime objects into their Hungarian text representation.
Expand Down
6 changes: 4 additions & 2 deletions hun_date_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from hun_date_parser.date_textualizer.datetime_textualizer import DatetimeTextualizer, datetime2text
from hun_date_parser.date_parser.datetime_extractor import DatetimeExtractor, text2datetime, text2date, text2time
from hun_date_parser.duration_parser.duration_parsers import parse_duration

# Names exported by `from hun_date_parser import *`.
# NOTE(review): `__all__` and `__version__` are each assigned twice below; in
# both cases only the second assignment takes effect. The first assignments
# look like the pre-change lines of a diff view -- confirm before removing.
__all__ = ["DatetimeTextualizer", "DatetimeExtractor", "datetime2text", "text2datetime", "text2date", "text2time"]
__all__ = ["DatetimeTextualizer", "DatetimeExtractor", "datetime2text", "text2datetime", "text2date", "text2time",
           "parse_duration"]

# Package version string.
__version__ = "0.2.8"
__version__ = "0.2.9"
5 changes: 5 additions & 0 deletions hun_date_parser/date_parser/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,8 @@

# start: X ...: Y
# Matches a "kezd..."/"indul..." word followed within 1-20 characters by a
# colon, then captures two colon-separated segments (each 1-50 non-colon
# characters); the second segment must run to the end of the string.
R_START_STATED_END_IMPLIED = r"(?:[kK]ezd|[iI]ndul).{1,20}:\s*([^:]{1,50}?)\s*:\s*([^:]{1,50})$"

# Duration
#
# All three patterns start with a lazy `.*?`, so the duration may be preceded
# by arbitrary text and the earliest possible match wins.

# Hour count: 1-2 digits or a Hungarian number word for 0-23. Every accented
# vowel also accepts its unaccented form so that accent-less input captures the
# whole word (previously "huszonharom" was captured as just "harom").
_R_HOUR_NUM_D = (
    r'[0-9]{1,2}|nulla|egy|kett[oöő]|h[aá]rom|n[eé]gy|[öo]t|hat|h[eé]t|nyolc|kilenc|t[ií]z'
    r'|tizenegy|tizenkett[oöő]|tizenh[aá]rom|tizenn[eé]gy|tizen[oö]t|tizenhat|tizenh[eé]t'
    r'|tizennyolc|tizenkilenc|h[uú]sz|huszonegy|huszonkett[oöő]|huszonh[aá]rom'
)

# Minute count: digits or a single word. Hungarian accented letters are
# included so that words such as "öt" or "tíz" are captured whole instead of
# being truncated at the accented character.
_R_MINUTE_WORD_D = r'[a-zA-Z0-9áéíóöőúüűÁÉÍÓÖŐÚÜŰ]{1,15}'

# "<hours> óra <minutes> perc" -- the hour part is optional.
# Groups: (hours or None, minutes).
R_HOUR_MIN_D = r'.*?(?:(' + _R_HOUR_NUM_D + r') [óo]ra)? ?(' + _R_MINUTE_WORD_D + r') perc'

# "<hours> óra" -- also accepts half hours written as "<digits>,5".
# Up to two digits are allowed before the comma so that e.g. "12,5 óra" is not
# read as "2,5 óra". Groups: (hours,).
R_HOUR_D = r'.*?(?:(' + _R_HOUR_NUM_D + r'|[0-9]{1,2},5)) ?[óo]r[aá]?'

# "[<hours> és] negyed|fél|háromnegyed óra".
# Groups: (whole hours or None, fraction word).
R_HOUR_HOUR_D = r'.*?(?:([0-9]{1,2}|egy|kett[oöő]|h[aá]rom|n[eé]gy|[öo]t|hat|h[eé]t|nyolc|kilenc|t[ií]z)?(?: [eé]s )? ?(h[aá]romnegyed|negyed|f[eé]l) ?[oó]r[aá])'
Empty file.
78 changes: 78 additions & 0 deletions hun_date_parser/duration_parser/duration_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import TypedDict, Optional, Sequence
import re
from hun_date_parser.utils import DateTimePartConatiner, remove_accent, word_to_num, Minute
from hun_date_parser.date_parser.patterns import R_HOUR_MIN_D, R_HOUR_HOUR_D, R_HOUR_D


class DateParts(TypedDict):
    """Result dictionary produced by ``duration_parser``."""

    # The string the result refers to; ``duration_parser`` stores its whole
    # input string here.
    match: str
    # Parsed parts; ``duration_parser`` puts at most one ``Minute`` entry here
    # and leaves the sequence empty when no positive duration was found.
    date_parts: Sequence[DateTimePartConatiner]


def convert_hour_to_minutes(hour_str: Optional[str]) -> float:
    """Translate an hour expression into a number of minutes.

    :param hour_str: Hour count as digits or a number word, optionally
        carrying a ",5" half-hour suffix (e.g. "2,5"); may be None.
    :return: The equivalent minutes, or 0 when ``hour_str`` is None or
        ``word_to_num`` cannot interpret it (signalled by -1).
    """
    if hour_str is None:
        return 0

    # A ",5" marker means "and a half"; strip it before converting the rest.
    has_half = ",5" in hour_str
    whole_hours = word_to_num(hour_str.replace(",5", "") if has_half else hour_str)

    if whole_hours == -1:
        return 0
    return (whole_hours + 0.5) * 60 if has_half else whole_hours * 60


def convert_quarter_hour(hour_str: Optional[str]) -> int:
    """Translate a fraction-of-an-hour word into minutes.

    :param hour_str: Text containing "háromnegyed", "negyed" or "fél"
        (accents optional for the first and last); may be None.
    :return: 45, 15 or 30 respectively, or 0 when ``hour_str`` is None or
        contains none of these words.
    """
    if hour_str is None:
        return 0

    normalized = remove_accent(hour_str)
    # "haromnegyed" contains "negyed", so the longer word is tested first.
    if "haromnegyed" in normalized:
        minutes = 45
    elif "negyed" in hour_str:
        minutes = 15
    elif "fel" in normalized:
        minutes = 30
    else:
        minutes = 0
    return minutes


def duration_parser(s: str) -> DateParts:
    """Extract a duration, expressed in minutes, from a Hungarian text.

    The patterns are tried in this order and the first usable one wins:

    1. ``R_HOUR_MIN_D``  -- "<hours> óra <minutes> perc" (hours optional)
    2. ``R_HOUR_D``      -- "<hours> óra", including "<n>,5 óra"
    3. ``R_HOUR_HOUR_D`` -- "[<hours> és] negyed/fél/háromnegyed óra"

    :param s: Input text that may contain a duration expression.
    :return: A dict whose ``"match"`` is the unchanged input string and whose
        ``"date_parts"`` holds a single ``Minute`` when a positive duration
        was found, otherwise an empty list.
    """
    res_mins = None

    match = re.match(R_HOUR_MIN_D, s)
    if match:
        hour_w, min_w = match.groups()
        minutes = word_to_num(min_w)
        # word_to_num signals failure with -1. Adding that sentinel to the
        # total would silently subtract a minute (e.g. 60 + -1 = 59), so an
        # unparseable minute word makes this pattern count as "no match" and
        # the hour-only patterns below get a chance instead.
        if minutes != -1:
            res_mins = convert_hour_to_minutes(hour_w) + minutes

    if res_mins is None:
        match = re.match(R_HOUR_D, s)
        if match:
            res_mins = convert_hour_to_minutes(match.group(1))

    if res_mins is None:
        match = re.match(R_HOUR_HOUR_D, s)
        if match:
            hour_w, fraction_w = match.groups()
            res_mins = convert_hour_to_minutes(hour_w) + convert_quarter_hour(fraction_w)

    if res_mins is None:
        res_mins = 0

    # Zero-length durations are reported as "nothing found".
    res_date_parts = [Minute(res_mins, "duration_parser")] if res_mins > 0 else []

    return {
        "match": s,
        "date_parts": res_date_parts
    }


def parse_duration(s: str) -> Optional[int]:
    """
    Returns the duration in minutes found in the input string.

    :param s: Input string containing the duration information.
    :return: The duration in minutes as an integer, or None if no valid duration is found.
    """
    date_parts = duration_parser(s)["date_parts"]
    if not date_parts:
        return None
    # Half-hour inputs such as "1,5 óra" are computed with float arithmetic
    # (90.0); the total is always a whole number of minutes, so converting to
    # int is lossless and honours the documented return type.
    return int(date_parts[0].value)
55 changes: 55 additions & 0 deletions test/test_duration_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest
from datetime import datetime

from hun_date_parser.duration_parser.duration_parsers import duration_parser
from hun_date_parser.date_parser.date_parsers import Minute


# (input text, expected ``date_parts`` list) pairs fed to the parametrized
# duration_parser test below.
tf_durations = [
    # Minute-only, fractional-hour and combined hour/minute expressions,
    # with and without suffixes or surrounding text.
    ("45 perc", [Minute(45, "duration_parser")]),
    ("45 percre ", [Minute(45, "duration_parser")]),
    ("50 perces", [Minute(50, "duration_parser")]),
    ("120 perc", [Minute(120, "duration_parser")]),
    ("negyed óráig", [Minute(15, "duration_parser")]),
    (" háromnegyed óra", [Minute(45, "duration_parser")]),
    ("25 perc", [Minute(25, "duration_parser")]),
    ("26 perc", [Minute(26, "duration_parser")]),
    ("1 óra 25 perc", [Minute(85, "duration_parser")]),
    ("1,5 óráig", [Minute(90, "duration_parser")]),
    ("2,5 óráig", [Minute(150, "duration_parser")]),
    ("1 óráig", [Minute(60, "duration_parser")]),
    ("1 és negyed óráig", [Minute(75, "duration_parser")]),
    ("1 és negyedóráig", [Minute(75, "duration_parser")]),
    ("egy és fél óráig", [Minute(90, "duration_parser")]),
    ("félóráig", [Minute(30, "duration_parser")]),
    ("eddig 3 és negyed óráig", [Minute(195, "duration_parser")]),
    ("eddig 3 es negyed oraig", [Minute(195, "duration_parser")]),
    ("eddig: 1,5 óráig", [Minute(90, "duration_parser")]),
    ("eddig: 45 percre", [Minute(45, "duration_parser")]),
    ("egy óra 10 percre", [Minute(70, "duration_parser")]),
    (": egy óra 10 percre :", [Minute(70, "duration_parser")]),

    # Minute counts with one, two and three digits.
    ("16 percre", [Minute(16, "duration_parser")]),
    ("99 percre", [Minute(99, "duration_parser")]),
    ("999 percre", [Minute(999, "duration_parser")]),
    ("2 óra 16 percre", [Minute(136, "duration_parser")]),

    # Whole hours given as digits.
    ("3 órára", [Minute(180, "duration_parser")]),
    ("4 órára", [Minute(240, "duration_parser")]),
    ("5 órára", [Minute(300, "duration_parser")]),
    ("6 órára", [Minute(360, "duration_parser")]),
    ("10 órára", [Minute(600, "duration_parser")]),

    # Inputs expected to yield no duration at all.
    ("16", []),
    ("100 órára", []),
    ("", []),
    (" fél", []),
    (" jövő kedd", []),
    ("délig", []),
]

@pytest.mark.parametrize("inp, exp", tf_durations)
def test_named_month(inp, exp):
    """Check that duration_parser yields the expected date parts for each case."""
    assert duration_parser(s=inp)["date_parts"] == exp

0 comments on commit 323673a

Please sign in to comment.