Improve parsing speed by about 6% by caching compiled regexes (#131)

* Improve parsing speed by about 6% by caching compiled regexes
2023-04-12 10:52:11 +02:00 · 2023-04-12 10:52:11 +02:00 · 5a59c36646
commit 5a59c36646
parent 9847bdf66b
2 changed files with 17 additions and 22 deletions
--- a/dsmr_parser/clients/telegram_buffer.py
+++ b/dsmr_parser/clients/telegram_buffer.py
@ -1,5 +1,13 @@
 import re

+# - Match all characters after the start of a telegram except for the start
+#   character itself, '[^\/]+', which eliminates incomplete preceding telegrams.
+# - Do a non-greedy match using '?' so the start is matched up to the first
+#   checksum that is found.
+# - The checksum is optional, '{0,4}', because not all telegram versions
+#   support it.
+_FIND_TELEGRAMS_REGEX = re.compile(r"\/[^\/]+?\![A-F0-9]{0,4}\0?\r\n", re.DOTALL)
+

 class TelegramBuffer(object):
    """
@ -8,14 +16,14 @@ class TelegramBuffer(object):
    """

    def __init__(self):
-        self._buffer = ''
+        self._buffer = ""

    def get_all(self):
        """
        Remove complete telegrams from buffer and yield them.
        :rtype generator:
        """
-        for telegram in self._find_telegrams():
+        for telegram in _FIND_TELEGRAMS_REGEX.findall(self._buffer):
            self._remove(telegram)
            yield telegram

@ -37,21 +45,3 @@ class TelegramBuffer(object):
        index = self._buffer.index(telegram) + len(telegram)

        self._buffer = self._buffer[index:]
-
-    def _find_telegrams(self):
-        """
-        Find complete telegrams in buffer from  start ('/') till ending
-        checksum ('!AB12\r\n').
-        :rtype: list
-        """
-        # - Match all characters after start of telegram except for the start
-        # itself again '^\/]+', which eliminates incomplete preceding telegrams.
-        # - Do non greedy match using '?' so start is matched up to the first
-        # checksum that's found.
-        # - The checksum is optional '{0,4}' because not all telegram versions
-        # support it.
-        return re.findall(
-            r'\/[^\/]+?\![A-F0-9]{0,4}\0?\r\n',
-            self._buffer,
-            re.DOTALL
-        )
--- a/dsmr_parser/parsers.py
+++ b/dsmr_parser/parsers.py
@ -25,8 +25,13 @@ class TelegramParser(object):
            telegram DSMR version (v4 and up).
        :type telegram_specification: dict
        """
-        self.telegram_specification = telegram_specification
        self.apply_checksum_validation = apply_checksum_validation
+        self.telegram_specification = telegram_specification
+        # Regexes are compiled once to improve performance
+        self.telegram_specification_regexes = {
+            signature: re.compile(signature, re.DOTALL)
+            for signature in self.telegram_specification['objects'].keys()
+        }

    def parse(self, telegram_data, encryption_key="", authentication_key=""):  # noqa: C901
        """
@ -80,7 +85,7 @@ class TelegramParser(object):
        telegram = Telegram()

        for signature, parser in self.telegram_specification['objects'].items():
-            pattern = re.compile(signature, re.DOTALL)
+            pattern = self.telegram_specification_regexes[signature]
            matches = pattern.findall(telegram_data)

            # Some signatures are optional and may not be present,