Merge pull request #47 from szegedai/duration_parser

Duration parser
szegedai · Jul 15, 2024 · 323673a · 323673a
2 parents d77b5d3 + 2ceb8d5
commit 323673a
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -118,6 +118,30 @@ text2datetime('péntek', now=datetime(2023, 6, 7), search_scope=SearchScopes.NOT
 #   'end_date': datetime.datetime(2023, 6, 9, 23, 59, 59)}]
 ```
 
+### Duration Parsing
+
+The duration parser can extract the duration in minutes from various expressions found in sentences.
+
+#### Recognized Formats
+
+The parser is capable of understanding a variety of duration expressions. Here are the primary formats it recognizes:
+
+- Hour and Minute Combination:
+  - Examples: `1 óra 45 perc`, `egy óra 30 perc`, `2 óra 15 perc`
+- Hour Only:
+  - Examples: `1 óra`, `egy óra`, `2 órát`, `3,5 óra`
+- Quarter- and Half-Hour Phrases:
+  - Examples: `háromnegyed óra`, `egy és negyed óra`, `kettő és fél óra`
+
+
+```python
+from hun_date_parser import parse_duration
+
+print(parse_duration('45 perc'))  # Output: 45
+
+print(parse_duration('1 és negyed óra'))  # Output: 75
+```
+
 ### Datetime to text
 
 The library is also capable of turning datetime objects into their Hungarian text representation.

diff --git a/hun_date_parser/__init__.py b/hun_date_parser/__init__.py
@@ -1,6 +1,8 @@
 from hun_date_parser.date_textualizer.datetime_textualizer import DatetimeTextualizer, datetime2text
 from hun_date_parser.date_parser.datetime_extractor import DatetimeExtractor, text2datetime, text2date, text2time
+from hun_date_parser.duration_parser.duration_parsers import parse_duration
 
-__all__ = ["DatetimeTextualizer", "DatetimeExtractor", "datetime2text", "text2datetime", "text2date", "text2time"]
+__all__ = ["DatetimeTextualizer", "DatetimeExtractor", "datetime2text", "text2datetime", "text2date", "text2time",
+           "parse_duration"]
 
-__version__ = "0.2.8"
+__version__ = "0.2.9"
diff --git a/hun_date_parser/date_parser/patterns.py b/hun_date_parser/date_parser/patterns.py
@@ -66,3 +66,8 @@
 
 # kezdo: X ...: Y
 R_START_STATED_END_IMPLIED = r"(?:[kK]ezd|[iI]ndul).{1,20}:\s*([^:]{1,50}?)\s*:\s*([^:]{1,50})$"
+
+# Duration
+# The duration parser applies these patterns with re.match, so the leading
+# lazy '.*?' is what lets an expression start anywhere in the input.
+#
+# R_HOUR_MIN_D: optional "<hour> óra" followed by "<minutes> perc".
+#   group 1 = hour digits/word (None when absent), group 2 = minute digits/word.
+R_HOUR_MIN_D = r'.*?(?:([0-9]{1,2}|nulla|egy|kett[oöő]|h[aá]rom|n[eé]gy|[öo]t|hat|h[eé]t|nyolc|kilenc|t[ií]z|tizenegy|tizenkett[oő]|tizenh[aá]rom|tizenn[eé]gy|tizen[oö]t|tizenhat|tizenh[eé]t|tizennyolc|tizenkilenc|h[uú]sz|huszonegy|huszonkett[oöő]|huszonkettő|huszonhárom) [óo]ra)? ?([a-zA-Z0-9]{1,15}) perc'
+# R_HOUR_D: "<hour> óra", where the hour may also be a decimal half ("N,5").
+#   group 1 = hour digits/word. The half form accepts one or two leading digits
+#   so that e.g. "12,5 óra" is captured whole instead of as "2,5".
+R_HOUR_D = r'.*?(?:([0-9]{1,2}|nulla|egy|kett[oöő]|h[aá]rom|n[eé]gy|[öo]t|hat|h[eé]t|nyolc|kilenc|t[ií]z|tizenegy|tizenkett[oő]|tizenh[aá]rom|tizenn[eé]gy|tizen[oö]t|tizenhat|tizenh[eé]t|tizennyolc|tizenkilenc|h[uú]sz|huszonegy|huszonkett[oöő]|huszonkettő|huszonhárom|[0-9]{1,2},5)) ?[óo]r[aá]?'
+# R_HOUR_HOUR_D: optional whole hour, optional " és ", then a fraction word
+# (háromnegyed / negyed / fél) and "óra".
+#   group 1 = hour digits/word (None when absent), group 2 = fraction word.
+R_HOUR_HOUR_D = r'.*?(?:([0-9]{1,2}|egy|kett[oöő]|h[aá]rom|n[eé]gy|[öo]t|hat|h[eé]t|nyolc|kilenc|t[ií]z)?(?: [eé]s )? ?(h[aá]romnegyed|negyed|f[eé]l) ?[oó]r[aá])'
diff --git a/hun_date_parser/duration_parser/__init__.py b/hun_date_parser/duration_parser/__init__.py
diff --git a/hun_date_parser/duration_parser/duration_parsers.py b/hun_date_parser/duration_parser/duration_parsers.py
@@ -0,0 +1,78 @@
+from typing import TypedDict, Optional, Sequence
+import re
+from hun_date_parser.utils import DateTimePartConatiner, remove_accent, word_to_num, Minute
+from hun_date_parser.date_parser.patterns import R_HOUR_MIN_D, R_HOUR_HOUR_D, R_HOUR_D
+
+
+class DateParts(TypedDict):
+    """Result of ``duration_parser``: the input text plus the parsed parts."""
+
+    # The string that was handed to the parser (duration_parser stores its
+    # input here unchanged).
+    match: str
+    # Parsed components; duration_parser fills this with at most one Minute
+    # entry, or leaves it empty when no positive duration was found.
+    date_parts: Sequence[DateTimePartConatiner]
+
+
+def convert_hour_to_minutes(hour_str: Optional[str]) -> float:
+    """Translate an hour expression (digits or a number word) into minutes.
+
+    A trailing decimal half written as ",5" adds thirty minutes. ``None`` and
+    anything ``word_to_num`` rejects (it answers -1) both count as 0 minutes.
+    """
+    if hour_str is None:
+        return 0
+
+    has_half = ",5" in hour_str
+    whole_part = hour_str.replace(",5", "") if has_half else hour_str
+    hours = word_to_num(whole_part)
+
+    # -1 is word_to_num's "could not convert" answer.
+    if hours == -1:
+        return 0
+    return (hours + 0.5) * 60 if has_half else hours * 60
+
+
+def convert_quarter_hour(hour_str: Optional[str]) -> int:
+    """Map a fraction-of-an-hour word to minutes (45, 15 or 30); 0 otherwise."""
+    if hour_str is None:
+        return 0
+
+    plain = remove_accent(hour_str)
+    # Order matters: "haromnegyed" contains "negyed", so it is tested first.
+    rules = (
+        ("haromnegyed", plain, 45),
+        ("negyed", hour_str, 15),
+        ("fel", plain, 30),
+    )
+    for needle, haystack, minutes in rules:
+        if needle in haystack:
+            return minutes
+    return 0
+
+
+def _duration_minutes(s: str) -> float:
+    """Return the minutes described by ``s``, or 0 when no pattern applies."""
+    # 1) "<hour> óra <minutes> perc" (the hour part is optional).
+    hour_min = re.match(R_HOUR_MIN_D, s)
+    if hour_min:
+        hour_w, min_w = hour_min.groups()
+        minutes = word_to_num(min_w)
+        # -1 is word_to_num's failure value (convert_hour_to_minutes checks for
+        # it as well). Adding it to the hour part would silently subtract one
+        # minute, so an unparsable minute word falls through to the patterns
+        # below instead.
+        if minutes != -1:
+            return convert_hour_to_minutes(hour_w) + minutes
+
+    # 2) "<hour> óra", including the "N,5" half-hour form.
+    hour_only = re.match(R_HOUR_D, s)
+    if hour_only:
+        return convert_hour_to_minutes(hour_only.group(1))
+
+    # 3) "[<hour> és] háromnegyed/negyed/fél óra".
+    hour_fraction = re.match(R_HOUR_HOUR_D, s)
+    if hour_fraction:
+        hour_w, fraction_w = hour_fraction.groups()
+        return convert_hour_to_minutes(hour_w) + convert_quarter_hour(fraction_w)
+
+    return 0
+
+
+def duration_parser(s: str) -> DateParts:
+    """Extract a duration from ``s`` and wrap it as a ``Minute`` date part.
+
+    The patterns are tried in a fixed order: hour + minute, hour only, then
+    hour + fraction word. The first one that yields a usable value wins.
+
+    :param s: Input string that may contain a duration expression.
+    :return: Dict with ``match`` set to the input string and ``date_parts``
+        holding a single ``Minute`` when a positive duration was found,
+        otherwise an empty list.
+    """
+    res_mins = _duration_minutes(s)
+
+    # Zero (nothing recognised) and negative totals produce no date part.
+    res_date_parts = [Minute(res_mins, "duration_parser")] if res_mins > 0 else []
+
+    return {
+        "match": s,
+        "date_parts": res_date_parts
+    }
+
+
+def parse_duration(s: str) -> Optional[int]:
+    """
+    Returns the duration in minutes found in the input string.
+    :param s: Input string containing the duration information.
+    :return: The duration in minutes as an integer, or None if no valid duration is found.
+    """
+    date_parts = duration_parser(s)["date_parts"]
+    if not date_parts:
+        return None
+    # Half-hour inputs such as "1,5 óra" are computed as (hours + 0.5) * 60 and
+    # therefore arrive as floats (90.0). Convert so the declared int return
+    # type actually holds.
+    return int(date_parts[0].value)
diff --git a/test/test_duration_parser.py b/test/test_duration_parser.py
@@ -0,0 +1,55 @@
+import pytest
+from datetime import datetime
+
+from hun_date_parser.duration_parser.duration_parsers import duration_parser
+from hun_date_parser.date_parser.date_parsers import Minute
+
+
+# (input text, expected date_parts) pairs fed to the parametrized test below.
+tf_durations = [
+    # Minute-only, fraction-word, decimal-half and combined hour/minute inputs,
+    # with and without suffixes, accents and surrounding text.
+    ("45 perc", [Minute(45, "duration_parser")]),
+    ("45 percre ", [Minute(45, "duration_parser")]),
+    ("50 perces", [Minute(50, "duration_parser")]),
+    ("120 perc", [Minute(120, "duration_parser")]),
+    ("negyed óráig", [Minute(15, "duration_parser")]),
+    (" háromnegyed óra", [Minute(45, "duration_parser")]),
+    ("25 perc", [Minute(25, "duration_parser")]),
+    ("26 perc", [Minute(26, "duration_parser")]),
+    ("1 óra 25 perc", [Minute(85, "duration_parser")]),
+    ("1,5 óráig", [Minute(90, "duration_parser")]),
+    ("2,5 óráig", [Minute(150, "duration_parser")]),
+    ("1 óráig", [Minute(60, "duration_parser")]),
+    ("1 és negyed óráig", [Minute(75, "duration_parser")]),
+    ("1 és negyedóráig", [Minute(75, "duration_parser")]),
+    ("egy és fél óráig", [Minute(90, "duration_parser")]),
+    ("félóráig", [Minute(30, "duration_parser")]),
+    ("eddig 3 és negyed óráig", [Minute(195, "duration_parser")]),
+    ("eddig 3 es negyed oraig", [Minute(195, "duration_parser")]),
+    ("eddig: 1,5 óráig", [Minute(90, "duration_parser")]),
+    ("eddig: 45 percre", [Minute(45, "duration_parser")]),
+    ("egy óra 10 percre", [Minute(70, "duration_parser")]),
+    (": egy óra 10 percre :", [Minute(70, "duration_parser")]),
+
+    # Two- and three-digit minute values.
+    ("16 percre", [Minute(16, "duration_parser")]),
+    ("99 percre", [Minute(99, "duration_parser")]),
+    ("999 percre", [Minute(999, "duration_parser")]),
+    ("2 óra 16 percre", [Minute(136, "duration_parser")]),
+
+    # Whole hours.
+    ("3 órára", [Minute(180, "duration_parser")]),
+    ("4 órára", [Minute(240, "duration_parser")]),
+    ("5 órára", [Minute(300, "duration_parser")]),
+    ("6 órára", [Minute(360, "duration_parser")]),
+    ("10 órára", [Minute(600, "duration_parser")]),
+
+    # Inputs that must not produce any duration.
+    ("16", []),
+    ("100 órára", []),
+    ("", []),
+    ("  fél", []),
+    ("  jövő kedd", []),
+    ("délig", []),
+]
+
+@pytest.mark.parametrize("inp, exp", tf_durations)
+def test_named_month(inp, exp):
+    """Check that duration_parser yields the expected date_parts for each input."""
+    # NOTE(review): the name looks like a leftover from another test module;
+    # consider renaming to test_duration_parser in a follow-up.
+    parsed = duration_parser(s=inp)
+    assert parsed["date_parts"] == exp