Mini Shell

Direktori : /opt/imunify360/venv/lib/python3.11/site-packages/im360/subsys/
Current File : //opt/imunify360/venv/lib/python3.11/site-packages/im360/subsys/modsec_audit_log.py
"""
SecAuditLog parser
"""
import json
import ipaddress
import os
import os.path
import posixpath
import re
import urllib.parse
from abc import ABCMeta, abstractmethod
from contextlib import suppress
from copy import copy
from http.cookies import CookieError, SimpleCookie
from itertools import product
from logging import getLogger
from typing import Optional, Tuple
from urllib.parse import parse_qs
from defence360agent.utils.common import DAY, rate_limit
from defence360agent.utils import USER_IDENTITY_FIELD, user_identity
from im360.contracts.config import ModsecSensor

# Names of the anomaly score fields; they are appended to
# PICK_SECAUDITLOG_FIELDS below.
ANOMALITY_SCORE_FIELDS = (
    "inbound_anomality_score",
    "outbound_anomality_score",
)
# Key names used in the result dicts built by the parsers of this module.
SEVERITY, ADVANCED, HEADERS = "severity", "advanced", "headers"
URI, HTTP_METHOD, FORM_PARAMS, QUERY_PARAMS, ATTACKERS_IP, STATUS_CODE = (
    "uri",
    "http_method",
    "form",
    "query",
    "attackers_ip",
    "status_code",
)
ENGINE_MODE = "engine_mode"

# fields to pass from ModSecurity to Imunify sensor socket
PICK_SECAUDITLOG_FIELDS = (
    "User-Agent",
    "Host",
    ATTACKERS_IP,
    "transaction_id",
    "rule",
    "msg",
    "message",
    "access_denied",
    SEVERITY,
    "ver",
    "tag",
    "Producer",
    "modsec_version",
    "vendor",
    STATUS_CODE,
    ENGINE_MODE,
    ADVANCED,
) + ANOMALITY_SCORE_FIELDS
# Lower-case names of the sensitive headers; _COOKIE is bound to the first
# one ("cookie").
# NOTE: this statement also binds the module-level name `_`
# (to "authorization").
_COOKIE, _ = _SENSITIVE_HEADERS = ("cookie", "authorization")

# Group 1: direction ("Inbound"/"Outbound", matched case-insensitively),
# group 2: the first run of digits that follows "Anomaly Score".
_PARSE_SCORE_REGEX = re.compile(
    r"(Inbound|Outbound) Anomaly Score.*?(\d+)", re.IGNORECASE
)

logger = getLogger(__name__)
# logger.exception wrapped with rate_limit(period=DAY); presumably it emits
# at most once per DAY -- confirm against the rate_limit implementation.
throttled_log_exception = rate_limit(period=DAY)(logger.exception)


class ParseError(RuntimeError):
    """
    Raised when audit log data cannot be parsed.

    Handlers should log it as logger.exception(*e.args)
    to avoid sentry duplicates.
    """


class _AmbiguousSeverity(RuntimeError):
    """
    Raised when a message carries more than one severity token.

    Meant to be logged as a warning.
    """


class MalformedFileError(RuntimeError):
    """
    Handlers should log it as logger.exception(*e.args)
    to avoid sentry duplicates.
    """


class _MiscDataNotInteresting(Exception):
    """Signals that the current line/entry should be skipped."""


class _SerialLogSectionParser(metaclass=ABCMeta):
    """
    Base class for the per-section parsers of the serial audit log.

    Subclasses set _IS_APPLICABLE_SINCE_REGEX to the pattern matching the
    section boundary line and implement parse_name_value_tokens().
    """

    _IS_APPLICABLE_SINCE_REGEX = None

    @classmethod
    def is_applicable_since(cls, line):
        """
        :return bool: True if *line* matches the section boundary regex
        """
        boundary_regex = cls._IS_APPLICABLE_SINCE_REGEX
        assert boundary_regex, "regex should be implemented"
        found = boundary_regex.match(line)
        return found is not None

    @classmethod
    @abstractmethod
    def parse_name_value_tokens(cls, line):
        """
        :return: name, value tokens
        :raise ValueError: if cannot parse string
        """


class _SectionAParser(_SerialLogSectionParser):
    """
    Parser for section A lines, e.g.
    ``[02/Feb/2018:11:49:22 +0000] WnRQQvAURie7wq9bOfH25AAAAAE ::1 57480 ...``
    """

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}A--$")
    # "[timestamp] <transaction id> <client ip> ..."
    _PARSE_IP_REGEX = re.compile(r"^\[.*?\] (?P<id>\S+) (?P<ip>\S+)")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        Yield the attacker's ip and the transaction id found in *line*.

        :return: generator of (name, value) tuples
        :raise _MiscDataNotInteresting: if the line does not match the
            expected layout or the address is not a valid IPv4/IPv6 one
        """
        match = cls._PARSE_IP_REGEX.match(line)
        if not match:
            logger.warning(
                "--section-A--: cannot parse this line: %s", repr(line)
            )
            raise _MiscDataNotInteresting()
        raw_ip = match.group("ip")
        try:
            ipv4_or_ipv6 = ipaddress.ip_address(raw_ip)
        except ValueError:
            # report the rejected address itself (group "ip"), not the
            # transaction id which is the first group of the regex
            logger.warning(
                "--section-A--: cannot use %s for IPv4 or IPv6 address",
                raw_ip,
            )
            raise _MiscDataNotInteresting()

        yield ATTACKERS_IP, str(ipv4_or_ipv6)
        yield "transaction_id", match.group("id")


class _SectionBParser(_SerialLogSectionParser):
    """Parser for section B lines: the request line and request headers."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}B--$")
    _PARSE_NAME_VALUE = re.compile(r"^(\S*): ?(.*)$")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        Yield either a single header token or the request line tokens.

        :return: generator of (name, value) tuples
        :raise _MiscDataNotInteresting: if a non-header line cannot be
            handled as a request line
        """
        header = cls._PARSE_NAME_VALUE.match(line)
        if header is None:
            try:
                # GET /a/c/getName/?param1=abs HTTP/1.1 HTTP/1.1
                verb, target, *_ = line.split()
                if verb.isupper() and os.path.isabs(target):
                    yield HTTP_METHOD, verb

                    url = urllib.parse.urlparse(target)
                    yield URI, url.path
                    if ModsecSensor.SEND_ADDITIONAL_DATA:
                        yield QUERY_PARAMS, obfuscate(url.query)
            except ValueError:
                raise _MiscDataNotInteresting()
            return

        header_name, raw_value = header.groups()
        # supply None instead of an empty string
        shown_value, wanted = obfuscate_if_sensitive_and_required(
            header_name, raw_value or None
        )
        if wanted:
            yield HEADERS, [header_name, shown_value]


class _SectionCParser(_SerialLogSectionParser):
    """Parser for section C lines (request body)."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}C--$")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        Yield the obfuscated line as form params, but only when
        ModsecSensor.SEND_ADDITIONAL_DATA is enabled.
        """
        if not ModsecSensor.SEND_ADDITIONAL_DATA:
            return
        yield FORM_PARAMS, obfuscate(line)


class _SectionFParser(_SerialLogSectionParser):
    """Parser for section F lines; only the response status line is used."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}F--$")
    _STATUS_CODE_REGEX = re.compile(r"^HTTP/\d\.\d (\d+).*$")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        Yield the status code (as a string) of an ``HTTP/x.y NNN ...`` line.

        :raise _MiscDataNotInteresting: for any other line
        """
        status_line = cls._STATUS_CODE_REGEX.match(line)
        if status_line is None:
            raise _MiscDataNotInteresting()
        yield STATUS_CODE, status_line.group(1)


class _BaseSectionHParser(_SerialLogSectionParser):
    """Stateless parser for section H lines ("Name: value" records)."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}H--$")
    _PARSE_MESSAGE_APACHE_ERROR = re.compile(
        r"([^=]*) (ModSecurity):\s+([^=]*)"
    )
    # a complete '[key "val"]' token
    _PARSE_MESSAGE_REGEX = re.compile(r'(\[\w+ ".*?"\])')
    _PARSE_ACCESS_DENIED_CODE = re.compile(r"Access denied with code \d+")
    # key and value of a complete '[key "val"]' token
    _PARSE_MESSAGE_TOKENS_REGEX = re.compile(r'\[(\w+) "(.*?)"\]')
    # a trailing '[key "val' token that lacks the closing '"]'
    _INCOMPLETE_MESSAGE_REGEX = re.compile(r'(\[\w+ ".*?$)')

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        Parse a single section H line.

        :return: (name, value) for plain fields; for message-like fields
            (name, dict) optionally followed by the incomplete part
        :raise ValueError: if the line is not of the "Name: value" kind
        :raise _MiscDataNotInteresting: for Apache-Error lines without
            ModSecurity part
        """
        field, payload = cls._parse_name_value(line)
        if field == "Apache-Error":
            return cls._parse_apache_err(payload, line)
        # ModSecurity name used for modsec version 3.x
        if field in ["Message", "ModSecurity"]:
            return (field, *cls._parse_message(payload, line))
        # note: ModSecurity v3 with `SecAuditLogFormat Native` has not
        # `Engine-Mode` (secrules_engine) field currently
        if field == "Engine-Mode":
            # remove surrounding double quotes
            return ENGINE_MODE, payload.strip('"')
        return field, payload

    @classmethod
    def _parse_name_value(cls, s):
        """
        Split "Name: value" into ("Name", "value").

        :raise ValueError: if cannot parse string
        """
        head, rest = s.split(maxsplit=1)
        if not head.endswith(":"):
            raise ValueError('Expecting sort of "Name: value"')
        # chop colon character
        return head[:-1], rest

    @classmethod
    def _parse_apache_err(cls, msgval, _):
        """
        Parse the ModSecurity part of an Apache-Error value.

        :raise _MiscDataNotInteresting: if there is no ModSecurity part
        """
        found = cls._PARSE_MESSAGE_APACHE_ERROR.match(msgval)
        if found is None:
            # Apache-Error line doesn't have ModSecurity part
            raise _MiscDataNotInteresting()
        _, name, modsec = found.groups()
        return (name, *cls._parse_message(modsec, ""))

    @classmethod
    def _parse_message(cls, msgval, audit_log_line):
        """
        Parse string with format [key "val"] [key "val"] ... [key "val"]
        to dictionary {"key": "val", "key": "val", ...}

        The text left after removing the tokens is stored under "message".

        In case when there is uncompleted value part
            e.g. '[key "val'
             with lost '"]' in the end
        it also returns incomplete part as string '[key "val'

        :return: tuple of the dictionary followed by incomplete parts
        :raise _AmbiguousSeverity: if severity is given more than once
        """
        complete_tokens = cls._PARSE_MESSAGE_REGEX.findall(msgval)

        # what remains of the message once all the tokens are cut out
        free_text = msgval
        for piece in complete_tokens:
            free_text = free_text.replace(piece, "").strip()
        dangling = cls._INCOMPLETE_MESSAGE_REGEX.findall(free_text)
        for piece in dangling:
            free_text = free_text.replace(piece, "").strip()

        parsed = {"message": free_text}
        if cls._PARSE_ACCESS_DENIED_CODE.search(free_text):
            parsed["access_denied"] = True

        joined_tokens = " ".join(complete_tokens)
        for found in cls._PARSE_MESSAGE_TOKENS_REGEX.finditer(joined_tokens):
            key, text = found.groups()
            previous = parsed.get(key)
            if previous is not None:
                if key == "severity":
                    logger.warning("Ambiguous severity: %r", audit_log_line)
                    raise _AmbiguousSeverity()
                if not isinstance(previous, list):
                    # convert multiple token entries into a list
                    previous = parsed[key] = [previous]
                previous.append(text)
            elif key == "tag":
                # 'tag' is always a list
                parsed[key] = [text]
            elif key == "id":
                # "rule_id" to be consistent with other plugins
                parsed["rule"] = text
                if text is None:
                    throttled_log_exception(
                        "rule field is None, modsec message: %s", msgval
                    )
            else:
                parsed[key] = text

            if key == "msg":
                parsed.update(get_anomality_score_items(text))

        return (parsed, *dangling)


class _SectionHParser(_BaseSectionHParser):
    """
    _BaseSectionHParser + DEF-2617 workaround

    Stateful variant: a message whose last '[key "val' token is cut off is
    buffered in self._incomplete_result and completed with following lines.
    """

    # Template of the "nothing is buffered" state.
    # NOTE(review): copy() used below is shallow, so every reset state shares
    # this very "value" dict; the code below only mutates "value" while
    # "count" is non-zero, i.e. after "value" was replaced with a parsed dict.
    _INCOMPLETE_RESULT = {
        "name": "",
        "value": dict(),
        "incomplete_part": "",
        "count": 0,
    }
    # upper bound for "count" after which continuation lines are not merged
    _MAX_INCOMPLETE_MESSAGE = 4

    def __init__(self):
        # True when the last successfully parsed line was a "Message" line
        # containing "[MatchedString"
        self._open_matched_string = False
        self._incomplete_result = copy(self._INCOMPLETE_RESULT)

    def parse_name_value_tokens(self, line):
        """
        In case with correct line e.g.
                Name: Message [key "val"] [key "val"] [key "val"]
            if there is no data in self._incomplete_result:
                returns result of _BaseSectionHParser.parse_name_value_tokens
            else
                returns name, value of self._incomplete_result
                    and name, value of current line
                it is possible since this function is generator.
        In case with incomplete line e.g.
                Name: Message [key "val"] [key "val"] [key "val
            if there is no data in self._incomplete_result:
                collect name, value and incomplete part values
                that was got from _BaseSectionHParser.parse_name_value_tokens
                to self._incomplete_result and wait next line.
                returns empty list
            else:
                does the same but
                returns name, value of self._incomplete_result
        In case with non 'name: value' format:
            if there is no data in self._incomplete_result:
                raises
            else:
                Concatenate incomplete result of previous
                iteration with current line and parse it
                by _BaseSectionHParser._parse_message.
                if there is still incomplete line:
                    it updates current self._incomplete_result and
                    returns empty list
                else:
                    returns name, value from collected data
        """
        # (name, value) pairs to be yielded from the `finally` clause
        result = []
        try:
            _res = super().parse_name_value_tokens(line)
        except ValueError:
            # the line is not of the "Name: value" kind
            _count = self._incomplete_result["count"]
            if not _count or _count > self._MAX_INCOMPLETE_MESSAGE:
                # nothing is buffered or the limit is exceeded:
                # the line is not merged into a buffered message
                if not self._open_matched_string:
                    raise
                else:
                    raise _MiscDataNotInteresting()

            # treat the line as a continuation of the buffered token
            _res = self._parse_message(
                self._incomplete_result["incomplete_part"] + line, line
            )
            # the "message" of the buffered value is kept, not this one
            _res[0].pop("message")
            if len(_res) == 1:
                # no incomplete part left: the buffered message is complete
                self._incomplete_result["value"].update(_res[0])
                name = self._incomplete_result["name"]
                value = self._incomplete_result["value"]
                result.append((name, value))
                self._incomplete_result = copy(self._INCOMPLETE_RESULT)
            else:
                # still incomplete: keep buffering
                self._incomplete_result["value"].update(_res[0])
                self._incomplete_result["incomplete_part"] = _res[1]
                self._incomplete_result["count"] += 1
        else:
            # If line was successfully parsed
            if self._incomplete_result["count"]:
                # emit the buffered message as is, with its unfinished
                # token appended to the message text
                _name = self._incomplete_result["name"]
                _value = self._incomplete_result["value"]
                _value["message"] += self._incomplete_result["incomplete_part"]
                result.append((_name, _value))
            if len(_res) == 3:
                # (name, value, incomplete part)
                name, value, incomplete = _res[0], _res[1], _res[2]
                if name == "Message" and "[MatchedString" in line:
                    # such a line is emitted at once, nothing is buffered
                    value["message"] += incomplete
                    result.append((name, value))
                    self._incomplete_result = copy(self._INCOMPLETE_RESULT)
                else:
                    # buffer the message and wait for the next line
                    self._incomplete_result["name"] = name
                    self._incomplete_result["value"] = value
                    self._incomplete_result["incomplete_part"] = incomplete
                    self._incomplete_result["count"] = 1
            else:
                name, value = _res[0], _res[1]
                result.append((name, value))
                self._incomplete_result = copy(self._INCOMPLETE_RESULT)
            self._open_matched_string = (
                name == "Message" and "[MatchedString" in line
            )
        finally:
            # yielding here turns this method into a generator: nothing
            # above runs before the first next() call
            for name, value in result:
                yield name, value


class _SectionZParser(_SerialLogSectionParser):
    """Recognizes the section Z boundary; it has no lines to parse."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}Z--$")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        :raise NotImplementedError: always
        """
        raise NotImplementedError("Expecting this method never be called.")


class _SectionNotInterestedInParser(_SerialLogSectionParser):
    """Matches section boundaries other than A, B, H and Z."""

    _IS_APPLICABLE_SINCE_REGEX = re.compile(r"^-{2,3}\w+-{1,3}[^ABHZ]--$")

    @classmethod
    def parse_name_value_tokens(cls, line):
        """
        :raise _MiscDataNotInteresting: always
        """
        raise _MiscDataNotInteresting()


class Parser(metaclass=ABCMeta):
    """Interface of audit log parsers plus shared bytes decoding helpers."""

    # printable ascii chars (codes 32..126 inclusive) without '%'
    _SAFE_UNQUOTED = "".join(map(chr, range(32, 127))).replace("%", "")

    @abstractmethod
    def feed(self, bytes_):
        """
        :param bytes_: audit log line
        :return dict: for complete result and None for incomplete.
                      See unit tests for result dict fields.
        :raise ParseError:
        """

    def flush(self):
        """Do nothing by default."""

    @classmethod
    def _adaptive_decode_bytes(cls, bytes_):
        """
        Decode *bytes_*; on UnicodeDecodeError fall back to percent-quoting
        everything but the chars of _SAFE_UNQUOTED.
        """
        try:
            decoded = cls.decode_bytes(bytes_)
        except UnicodeDecodeError:
            decoded = urllib.parse.quote_from_bytes(
                bytes_, safe=cls._SAFE_UNQUOTED
            )
        return decoded

    @staticmethod
    def decode_bytes(bytes_):
        """
        Separate function because 'read-only' bstr.decode() attr
        cannot be mocked.
        """
        return bytes_.decode()


class _RevolverParser(Parser):
    """
    Parser that delegates to one of `_available_parsers` and switches
    to the next one when the current one raises ParseError.
    """

    # skip first batch of errors if Parsers starts parsing
    # in the middle of serial log on agent restart
    _SKIP_FIRST_NUM_ERRORS = 11
    _available_parsers = []

    def __init__(self, audit_logdir_path=None):
        """
        :param str audit_logdir_path: audit log dir path for ModSecurity
            concurrent mode
        """
        self._audit_logdir_path = audit_logdir_path
        self._revolve()
        self._skip_first_num_errors = self._SKIP_FIRST_NUM_ERRORS

    def feed(self, bytes_):
        """
        Feed the line to the parsers; ParseError is swallowed (None is
        returned) while the budget of the first errors is not used up.
        """
        try:
            parsed = self._feed_impl(bytes_)
        except ParseError:
            if self._skip_first_num_errors <= 0:
                raise
            # skip "first num errors"
            self._skip_first_num_errors -= 1
            return None

        if bytes_.strip():
            # starting from the first non empty and successfully processed
            # token we expect no errors
            self._skip_first_num_errors = 0
        return parsed

    def flush(self):
        """Flush the parser which is currently in use."""
        return self._current_parser.flush()

    def _feed_impl(self, bytes_):
        """
        Give every available parser one attempt to parse *bytes_*.

        :raise ParseError: if the last attempt has failed as well
        """
        attempts_total = len(self._available_parsers)
        for attempt in range(1, attempts_total + 1):
            try:
                return self._current_parser.feed(bytes_)
            except ParseError as e:
                if attempt >= attempts_total:
                    # raise on last attempt
                    raise ParseError(
                        "Neither of available parsers is capable "
                        "to parse this: %r ",
                        bytes_,
                    ) from e
                # give the next parser a chance...
                self._revolve()

    def _revolve(self):
        """Exchange parser to the next"""
        parsers = self._available_parsers
        assert (
            len(parsers) >= 2
        ), "Count of available_parsers should not be less than 2"
        logger.info(
            "Swap %s<->%s",
            parsers[0].__name__,
            parsers[1].__name__,
        )

        # rotate: the first parser class goes to the end
        self._available_parsers = parsers[1:] + parsers[:1]

        # use blank parser state
        parser_cls = self._available_parsers[0]
        if self._audit_logdir_path is None:
            self._current_parser = parser_cls()
        else:
            self._current_parser = parser_cls(
                audit_logdir_path=self._audit_logdir_path
            )


class _JsonLogParserBase(Parser):
    """
    Base for parsers of audit logs where every line is one json document.

    Subclasses implement parse_json_data().
    """

    def __init__(self, **_):
        self._reset_accumulated()

    def _reset_accumulated(self):
        """Drop the parsed result and the lines it was built from."""
        self._accumulating_result = {}
        self._accumulating_lines = []

    def parse_json_data(self, data: dict) -> dict:
        """
        Convert the decoded json document to the result dict.

        :raise _MiscDataNotInteresting: if the entry should be skipped
        """
        raise NotImplementedError()

    def feed(self, bytes_):
        """
        :param bytes_: audit log line holding a whole json document
        :return: result of flush() for a parsed entry,
            None for an empty line or an entry which is not interesting
        :raise ParseError: on any other failure
        """
        line = self._adaptive_decode_bytes(bytes_.rstrip())
        if not line:
            return None
        try:
            self._accumulating_lines.append(line)
            self._accumulating_result = self.parse_json_data(
                json.loads(line)
            )
            return self.flush()
        except _MiscDataNotInteresting:
            return None
        except Exception as e:
            raise ParseError(
                "Error occurs while parse json line %r, reason: %r"
                % (line, e)
            ) from e
        finally:
            # flush() resets the state on success only; without this
            # a skipped or failed line would stay in the buffer and
            # pollute debug_ctx of the next entry
            self._reset_accumulated()

    def flush(self):
        """
        Build the incident list from the accumulated result and
        reset the accumulated state.
        """
        try:
            return mk_modsec_indicent_list(
                self._accumulating_result, debug_ctx=self._accumulating_lines
            )
        finally:
            self._reset_accumulated()


class _JsonLogParserv2(_JsonLogParserBase):
    """Parser of json audit log entries produced by ModSecurity v2."""

    def parse_json_data(self, data: dict) -> dict:
        """
        Convert a ModSecurity v2 json log entry to the result dict.

        The contents of the log entry can be seen here
        https://github.com/SpiderLabs/ModSecurity/blob/v2/master/apache2/msc_logging.c#L644
        Optional fields may be omitted due to SecAuditLogParts setting.
        Expected data (some unused keys are omitted):
        {'transaction': {
                'time': str,
                'transaction_id': str,
                'remote_address': str,
                'remote_port': int,
                'local_address': str,
                'local_port': int
            },
         'request': {
                'request_line': str,  # optional field
                'body': [],  # optional field
                'headers': {},  # optional field
            },
         'response': {
                'protocol': str,  # optional field
                'body': str,  # optional field
                'headers': {},  # optional field
                'status': int,  # optional field
            },
         'audit_data': {
                'producer': str or [],  # optional field
                'messages': [],  # optional field
                'error_messages': [],  # optional field
                'engine_mode': str,  # optional field
            }
         }

        :raise ParseError: if a top level section is missing
        :raise _MiscDataNotInteresting: if remote address is not a valid ip
        """  # noqa: E501
        required_sections = {
            "transaction",
            "request",
            "response",
            "audit_data",
        }
        if not data.keys() >= required_sections:
            raise ParseError("Not expected json data for ModSecurity v2")

        transaction = data["transaction"]
        request = data["request"]
        response = data["response"]
        audit_data = data["audit_data"]
        send_extra = ModsecSensor.SEND_ADDITIONAL_DATA
        parsed = {}

        # section A
        remote_address = transaction["remote_address"]
        try:
            parsed[ATTACKERS_IP] = str(ipaddress.ip_address(remote_address))
        except ValueError:
            logger.warning(
                "Cannot use %s for IPv4 or IPv6 address",
                remote_address,
            )
            raise _MiscDataNotInteresting()
        parsed["transaction_id"] = transaction.get("transaction_id", "-")

        # section B
        checked_headers = (
            (name, obfuscate_if_sensitive_and_required(name, value))
            for name, value in request.get("headers", {}).items()
        )
        parsed[HEADERS] = [
            [name, shown_value]
            for name, (shown_value, wanted) in checked_headers
            if wanted
        ]
        request_line = request.get("request_line")
        if request_line:
            # GET /a/c/getName/?param1=abs HTTP/1.1
            method, uri, *_ = request_line.split()
            if method.isupper() and os.path.isabs(uri):
                url = urllib.parse.urlparse(uri)
                parsed[HTTP_METHOD] = method
                parsed[URI] = url.path
                if send_extra:
                    parsed[QUERY_PARAMS] = obfuscate(url.query)

        # section C
        body = request.get("body")
        if body and send_extra:
            parsed[FORM_PARAMS] = obfuscate("".join(body))

        # section F
        status = response.get("status")
        if status:
            parsed[STATUS_CODE] = status

        # section H
        producer = audit_data.get("producer")
        if producer:
            if isinstance(producer, str):
                version, _ = get_producer_data(producer)
            else:
                # the first item is modsec itself, the rest are vendors
                modsec_full_name, *vendors = producer
                version, _ = get_producer_data(modsec_full_name)
                parsed["vendor"] = vendors
            parsed["modsec_version"] = version
        engine_mode = audit_data.get("engine_mode")
        if engine_mode:
            parsed[ENGINE_MODE] = engine_mode
        messages = audit_data.get("messages")
        if messages:
            parsed["MessageList"] = [
                _SectionHParser._parse_message(text, "")[0]
                for text in messages
            ]
        return parsed


class _KeyMappedJsonLogParserv3(_JsonLogParserBase):
    """
    Parse ModSecurity v3 json log entries.

    Inheritors define KEYS_MAPPER to name the producer specific fields.
    """

    KEYS_MAPPER = {}  # Defines field mappings

    def _to_lower_keys(self, obj):
        """
        Return a copy of *obj* with all dict keys lowercased, recursively;
        the value stored under "headers" is left as is.

        Assume that *obj* is json serializable.
        """
        if isinstance(obj, dict):
            # Leave the headers as is
            return {
                k.lower(): (
                    self._to_lower_keys(v) if k.lower() != "headers" else v
                )
                for k, v in obj.items()
            }
        elif isinstance(obj, list):
            return [self._to_lower_keys(item) for item in obj]
        else:
            return obj

    def parse_json_data(self, data: dict) -> dict:
        """
        Parse log entry data for JSON SecAuditLogFormat.
        https://github.com/SpiderLabs/ModSecurity/wiki/Reference-Manual-(v2.x)#SecAuditLogFormat
        The contents of the log entry can be seen here
        https://github.com/SpiderLabs/ModSecurity/blob/v3/master/src/transaction.cc#L1622
        Optional fields may be omitted due to SecAuditLogParts setting.
        Expected data:
        {'transaction': {
                 'client_ip': str,
                 'time_stamp': str,
                 'server_id': str,
                 'client_port': int,
                 'host_ip': str,
                 'host_port': int,
                 'unique_id': str,
                 'request': {
                             'http_version': str,
                             'method': str,
                             'uri': str,
                             'body': str,  # optional field
                             'headers': {},  # optional field
                             },
                 'response': {
                              'body': str,  # optional field
                              'headers': {},  # optional field
                              'http_code': int,
                              },
                 # optional fields below
                 'producer': {
                              'components': [],
                              'connector': str,
                              'modsecurity': str,
                              'secrules_engine': str,
                              },
                 'messages': [{'details': {'accuracy': str,
                                           'data': str,
                                           'file': str,
                                           'lineNumber': str,
                                           'match': str,
                                           'maturity': str,
                                           'reference': str,
                                           'rev': '1',
                                           'ruleId': str,
                                           'severity': str,
                                           'tags': [],
                                           'ver': str},
                               'message': str}]}}

        :raise NotImplementedError: if KEYS_MAPPER is not defined
        :raise ParseError: on unexpected top level keys or missing id field
        :raise _MiscDataNotInteresting: if json support is not compiled in
            or client ip is not a valid address
        """  # noqa: E501
        if not self.KEYS_MAPPER:
            # Ensuring we are executing in the inheritor with required class attr defined
            raise NotImplementedError(
                "_KeyMappedJsonLogParserv3 misses KEYS_MAPPER definition"
            )
        error_data = {
            "error": "ModSecurity was not compiled with JSON support."
        }
        if data == error_data:
            logger.warning(
                "SecAuditLogFormat set to JSON, but "
                "ModSecurity was not compiled with JSON support."
            )
            raise _MiscDataNotInteresting()
        expected_data_keys = {"transaction"}
        if data.keys() != expected_data_keys:
            raise ParseError("Not expected json data for Coraza/ModSecurityV3")
        # to avoid possible key format changes in ModSecurity
        # convert dict keys to lowercase
        data = self._to_lower_keys(data)
        result = {}
        transaction = data["transaction"]
        additional_data_is_required = ModsecSensor.SEND_ADDITIONAL_DATA
        # section A
        try:
            result[ATTACKERS_IP] = str(
                ipaddress.ip_address(transaction["client_ip"])
            )
        except ValueError:
            logger.warning(
                "Cannot use %s for IPv4 or IPv6 address",
                transaction["client_ip"],
            )
            raise _MiscDataNotInteresting()

        transaction_id = transaction.get(
            self.KEYS_MAPPER["transaction_id"], None
        )
        if transaction_id is None:
            raise ParseError(
                "Missing unique id field in json data for Coraza/ModSecurityV3"
            )
        # looks like there could be both string and int values,
        # so let's make it a string
        result["transaction_id"] = str(transaction_id)

        # section B
        # headers are optional, hence .get() with an empty default
        request_headers = transaction["request"].get("headers", {})
        result[HEADERS] = []
        for header, value in request_headers.items():
            (
                maybe_obfuscated_value,
                is_required,
            ) = obfuscate_if_sensitive_and_required(header, value)
            if is_required:
                result[HEADERS].append([header, maybe_obfuscated_value])

        result[HTTP_METHOD] = transaction["request"]["method"]
        parsed = urllib.parse.urlparse(transaction["request"]["uri"])
        result[URI] = parsed.path
        if additional_data_is_required:
            result[QUERY_PARAMS] = obfuscate(parsed.query)
        # section C
        if transaction["request"].get("body") and additional_data_is_required:
            result[FORM_PARAMS] = obfuscate(transaction["request"]["body"])
        # section F
        result[STATUS_CODE] = transaction["response"].get(
            self.KEYS_MAPPER["status_code"]
        )
        # section H
        if transaction.get("producer"):
            result["vendor"] = transaction["producer"]["components"]
            version, _ = get_producer_data(
                transaction["producer"]["modsecurity"]
            )
            result["modsec_version"] = version
            result[ENGINE_MODE] = transaction["producer"].get(
                self.KEYS_MAPPER["secrules_engine"]
            )
        if transaction.get("messages"):
            # The values below are the same for every message of the entry.
            # Headers might be in lowercase e.g.:
            #  tests/core/fixtures/test_modsec_audit_log/
            #   test_parser_log_sample-13_json_v3
            #   test_parser_log_sample-29_pretty_json_litespeed
            # request_headers is {} when headers are omitted, so an entry
            # with messages but without headers no longer fails on KeyError
            hostname = request_headers.get("Host") or request_headers.get(
                "host"
            )
            access_denied = result[STATUS_CODE] == 403

            result["MessageList"] = []
            for msg in transaction["messages"]:
                message = msg["details"]
                message["rule"] = str(message.pop("ruleid"))
                message["msg"] = (
                    msg["msg"] if "msg" in msg else msg.get("message")
                )
                message["hostname"] = hostname
                message["access_denied"] = access_denied
                if message.get("msg"):
                    message.update(get_anomality_score_items(message["msg"]))
                result["MessageList"].append(message)
        return result


class _JsonLogParserv3(_KeyMappedJsonLogParserv3):
    """Key mapped json parser with the ModSecurity v3 field names."""

    KEYS_MAPPER = dict(
        transaction_id="unique_id",
        status_code="http_code",
        secrules_engine="secrules_engine",
    )


class _JsonLogParserCoraza(_KeyMappedJsonLogParserv3):
    """Key mapped json parser with the Coraza field names."""

    KEYS_MAPPER = dict(
        transaction_id="id",
        status_code="status",
        secrules_engine="rules_engine",
    )


class _JsonPrettyLogParser(_JsonLogParserv3):
    """
    Parser of pretty printed json entries: a document starts with a "{"
    line, ends with a "}" line and is fed line by line.
    """

    def feed(self, bytes_):
        """
        :param bytes_: one line of a pretty printed json document
        :return: result of flush() once the closing "}" line of
            an interesting entry is fed, None otherwise
        :raise ParseError: if the line is unexpected at this point or
            the complete document cannot be parsed
        """
        line = self._adaptive_decode_bytes(bytes_.rstrip())
        if not line:
            return None

        if line == "{":
            if self._accumulating_lines:
                self._accumulating_lines = []
                # the offending line fills the %r placeholder when
                # the error is logged as logger.exception(*e.args)
                raise ParseError(
                    "Error occurs while parse json line %r, "
                    "not empty line buffer on start new json",
                    line,
                )
        elif not self._accumulating_lines:
            raise ParseError(
                "Error occurs while parse json line %r, "
                "empty line buffer in a middle of json dict",
                line,
            )

        self._accumulating_lines.append(line)
        if line != "}":
            # the document is not complete yet
            return None

        try:
            self._accumulating_result = self.parse_json_data(
                json.loads("".join(self._accumulating_lines))
            )
            return self.flush()
        except _MiscDataNotInteresting:
            return None
        except Exception as e:
            raise ParseError(
                "Error occurs while parse json line %r, reason: %r"
                % (line, e)
            ) from e
        finally:
            # The document is over whatever the outcome is. Without the
            # reset a skipped or failed document would stay in the buffer
            # and the next "{" line would raise a spurious ParseError.
            self._accumulating_result = {}
            self._accumulating_lines = []


class _NativeLogParser(Parser):
    """
    State machine for the ModSecurity 2 "Native" audit log format.

    Every line that one of the section sub-parsers recognises switches the
    current state; any other line is handed to the sub-parser selected by
    the latest switch. The Z section triggers flush().

    Format reference:
    https://github.com/SpiderLabs/ModSecurity/wiki/ModSecurity-2-Data-Formats#Audit_Log

    More about ModSecurity log message format
    https://gerrit.cloudlinux.com/plugins/gitiles/defence360/+/refs/changes/73/111973/1/opt/DEF-21076-multiline-modsec-header-field-alert/README.txt

    Expected data:
    --2172df61-A--
    [02/Feb/2018:11:49:22 +0000] WnRQQvAURie7wq9bOfH25AAAAAE ::1 57480 ::1 80
    --2172df61-B--
    POST /1/request?x=yxz&z=xy HTTP/1.1
    User-Agent: Nessus
    Host: localhost
    Accept: */*
    cOOkIe: ABC=abc;SESSIONID=hash256
    Content-Length: 9
    Content-Type: application/x-www-form-urlencoded

    --2172df61-C--
    bc=d123&cd=e&bc=a0
    --2172df61-F--
    HTTP/1.1 404 Not Found
    Accept-Ranges: bytes
    Transfer-Encoding: chunked
    Content-Type: text/html

    --2172df61-H--
    Message: Warning. Matched phrase "nessus" at REQUEST_HEADERS:User-Agent. [file "/etc/apache2/conf.d/modsec_vendor_configs/imunify360_full_apache/103_Global_Agents.conf"] [line "17"] [id "210801"] [rev "2"] [msg "COMODO WAF: Request Indicates a Security Scanner Scanned the Site||localhost|F|2"] [data "nessus"] [severity "CRITICAL"] [tag "CWAF"] [tag "Agents"]
    Message: Warning. Pattern match "(?i:(?:^(?:microsoft url|user-Agent|www\\.weblogs\\.com|(?:jakart|vi)a|(google|i{0,1}explorer{0,1}\\.exe|(ms){0,1}ie( [0-9.]{1,}){0,1} {0,1}(compatible( browser){0,1}){0,1})$)|\\bdatacha0s\\b|; widows|\\\\r|a(?: href=|d(?:sarobot|vanced email extractor ..." at REQUEST_HEADERS:User-Agent. [file "/etc/apache2/conf.d/modsec_vendor_configs/imunify360_full_apache/103_Global_Agents.conf"] [line "29"] [id "210831"] [rev "2"] [msg "COMODO WAF: Rogue web site crawler||localhost|F|4"] [data "Nessus"] [severity "WARNING"] [tag "CWAF"] [tag "Agents"]
    Message: Warning. Operator GE matched 5 at TX:incoming_points. [file "/etc/apache2/conf.d/modsec_vendor_configs/imunify360_full_apache/122_Outgoing_FiltersEnd.conf"] [line "35"] [id "214930"] [rev "1"] [msg "COMODO WAF: Inbound Points Exceeded|Total Incoming Points: 8|localhost|F|2"] [severity "CRITICAL"] [tag "CWAF"] [tag "FiltersEnd"]
    Apache-Handler: default-handler
    Stopwatch: 1517572162346260 98908 (- - -)
    Stopwatch2: 1517572162346260 98908; combined=33129, p1=737, p2=32228, p3=0, p4=0, p5=163, sr=0, sw=1, l=0, gc=0
    Producer: ModSecurity for Apache/2.9.2 (http://www.modsecurity.org/); CWAF_Apache.
    Server: Apache
    Engine-Mode: "DETECTION_ONLY"

    --2172df61-Z--
    """  # noqa: E501

    # section sub-parsers, probed in this order for every line
    _AVAILABLE_SUBPARSERS = (
        _SectionAParser,
        _SectionBParser,
        _SectionCParser,
        _SectionFParser,
        _SectionHParser,
        _SectionZParser,
        _SectionNotInterestedInParser,
    )

    def __init__(self, **_):
        # sub-parser of the section being read, None between records
        self._state_parser = None
        # name -> value tokens collected for the current record
        self._accumulating_result = {}
        # raw lines of the current record, passed as debug context on flush
        self._accumulating_lines = []

    def feed(self, bytes_):
        """
        Remember the raw line, then run it through the state machine.

        :param bytes bytes_: raw line
        :return: result of flush() when the line opens the Z section,
            None otherwise
        """
        self._accumulating_lines.append(bytes_)
        return self._feed_impl(bytes_)

    def _feed_impl(self, bytes_):
        """
        State machine step for a single raw line.

        :raise ParseError: if no sub-parser recognises the line and no
            section has been entered yet
        """
        line = self._adaptive_decode_bytes(bytes_.rstrip())
        if not line:
            return None

        # peek next available parser
        applicable = (
            candidate
            for candidate in self._AVAILABLE_SUBPARSERS
            if candidate.is_applicable_since(line)
        )
        try:
            section_parser = next(applicable)
        except StopIteration:
            # not a section switch: the line belongs to the current section
            if self._state_parser is None:
                raise ParseError("No parser for line %r", line)
        else:
            if section_parser is _SectionZParser:
                return self.flush()
            self._state_parser = section_parser()
            # waiting for the next feed()
            return None

        collected = self._accumulating_result
        for name, value in self.parse_name_value_tokens(line):
            # continue accumulating result
            is_message = name == "Message"
            # save meta data from ModSecurity itself
            # if Message data was not found before
            is_first_modsec_note = (
                name == "ModSecurity" and "MessageList" not in collected
            )
            if is_message or is_first_modsec_note:
                collected.setdefault("MessageList", []).append(value)
            elif name == HEADERS:
                collected.setdefault(HEADERS, []).append(value)
            elif name == "Producer":
                self._parse_producer_filed(value)
            else:
                collected[name] = value
        return None

    def _parse_producer_filed(self, line):
        """
        Store what get_producer_data() extracts from the "Producer" value;
        parts reported as None are not stored.
        """
        modsec_ver, vendors = get_producer_data(line)
        for key, found in (
            ("modsec_version", modsec_ver),
            ("vendor", vendors),
        ):
            if found is not None:
                self._accumulating_result[key] = found

    def parse_name_value_tokens(self, line):
        """
        Run the current section sub-parser over the line.

        :return list: (name, value) tokens produced by the sub-parser;
            an empty list when the sub-parser signals that the line should
            be ignored without interrupting the parse process
        :raise ParseError: if the sub-parser raises ValueError
        """
        section = self._state_parser
        try:
            tokens = list(section.parse_name_value_tokens(line))
        except ValueError as e:
            raise ParseError(str(e)) from e
        except (_MiscDataNotInteresting, _AmbiguousSeverity):
            # continue accumulating result
            tokens = []
        return tokens

    def flush(self):
        """
        Turn the accumulated tokens into an incident list and reset the
        parser state (the reset happens even if the conversion raises).
        """
        tokens, raw_lines = self._accumulating_result, self._accumulating_lines
        try:
            return mk_modsec_indicent_list(tokens, debug_ctx=raw_lines)
        finally:
            self._state_parser = None
            self._accumulating_result = {}
            self._accumulating_lines = []


class SerialLogParser(_RevolverParser):
    """
    Line-by-line audit log parser choosing among the JSON and native
    format parsers listed in _available_parsers.
    """

    # don't skip errors
    _SKIP_FIRST_NUM_ERRORS = 0
    _available_parsers = [
        _JsonLogParserv2,
        _JsonLogParserv3,
        _NativeLogParser,
        _JsonLogParserCoraza,
        _JsonPrettyLogParser,
    ]

    @classmethod
    def parse_file(cls, filepath):
        """
        Parse a whole audit log file in one call.

        :param filepath: path of the file, opened in binary mode
        :return: the first non-None result of feed(); if no line produces
            one, the file is reported as incomplete and flush() is returned
        :raise MalformedFileError: when feed() raises ParseError; carries
            the file path and the 1-based line number
        """
        parser = cls()
        with open(filepath, "rb") as stream:
            # use line 1 as base (most text editors do)
            for lineno, raw_line in enumerate(stream, start=1):
                try:
                    incidents = parser.feed(raw_line)
                except ParseError as err:
                    # return file:line in exception for better debug
                    # experience
                    raise MalformedFileError(
                        "Error in audit log file %r line %d",
                        filepath,
                        lineno,
                    ) from err
                if incidents is not None:
                    # then we are done
                    return incidents

        # return at least something
        leftover_lines = parser._current_parser._accumulating_lines
        logger.error(
            "Incomplete audit log file %r: %r",
            filepath,
            leftover_lines,
        )
        return parser.flush()


class ConcurrentLogParser(Parser):
    """
    Parser for index lines that point at a separate audit log file.

    Every absolute-path token of the fed line is tried, last token first,
    as a location of the audit log file, which is then parsed with
    SerialLogParser.parse_file().
    """

    def __init__(self, audit_logdir_path):
        """
        :param str audit_logdir_path: audit log dir path for ModSecurity
            concurrent mode
        """
        self._audit_logdir_path = audit_logdir_path

    def _normconcat(self, token):
        """
        Prefix the token with the audit log dir path.

        :param str token: is expected to start with posixpath.sep
        """
        return self._audit_logdir_path + token

    def _directadmin_concat(self, token):
        """
        Prefix the token with the audit log dir path followed by the
        first path component of the token itself.

        :param str token: is expected to start with posixpath.sep
        """
        return (
            self._audit_logdir_path
            + posixpath.sep
            + token.split(posixpath.sep)[1]
            + token
        )

    def feed(self, bytes_):
        """
        Locate and parse the audit log file referenced by an index line.

        :param bytes bytes_: raw index line
        :return: result of SerialLogParser.parse_file() for the first
            candidate path that is an existing non-empty file
        :raise ParseError: if the line cannot be decoded or none of its
            tokens leads to a usable file
        """
        try:
            str_ = bytes_.decode()
        except UnicodeDecodeError:
            # 'pass' to raise
            # ParseError("No audit log found in %r", bytes_) then
            pass
        else:
            reversed_tokens = reversed([t.strip("[]") for t in str_.split()])
            filtered_tokens = filter(os.path.isabs, reversed_tokens)
            for token, concat_fun in product(
                filtered_tokens, (self._normconcat, self._directadmin_concat)
            ):
                try:
                    log_path = None
                    token_path = concat_fun(token)
                    if os.path.isfile(token_path):  # Modsecurity v2
                        log_path = token_path
                    elif os.path.isfile(token):  # Modsecurity v3
                        log_path = token
                except (UnicodeEncodeError, ValueError):
                    # another corner case handling:
                    # 'pass' is to raise
                    # ParseError("No audit log found in %r", bytes_) then
                    pass
                else:
                    # lazy %-args: the message is only rendered when the
                    # debug level is enabled (the text stays the same)
                    logger.debug(
                        "os.path.isfile(%r) = %r",
                        (token_path, token),
                        bool(log_path),
                    )
                    with suppress(FileNotFoundError):  # handle race condition
                        if log_path and os.path.getsize(log_path) > 0:
                            return SerialLogParser.parse_file(log_path)

        raise ParseError("No audit log found in %r", bytes_)


class RevolverParser(_RevolverParser):
    # Candidate parsers handed to _RevolverParser: the serial log parser is
    # listed before the concurrent (index line -> separate file) one.
    # NOTE(review): how _RevolverParser selects among them is not visible
    # in this part of the file -- confirm the order matters.
    _available_parsers = [SerialLogParser, ConcurrentLogParser]


class _IncidentFixupList:
    """
    Normalisation steps applied to every raw incident before it is
    returned to the caller (see apply()).
    """

    class InvalidIncident(RuntimeError):
        """
        For incidents we cannot use data from, e.g.
        "Message: Rule processing failed."
        """

        pass

    # map modsec severity string <-> ossec modsec severity int
    # github.com/SpiderLabs/ModSecurity/wiki/Reference-Manual#severity
    _SEVERITY_LEVELS = {
        # Severe attack - No chances of false positives.
        # Immediate attention is necessary.
        "EMERGENCY": 0,
        # High importance security event.
        "ALERT": 1,
        # Multiple user generated errors
        "CRITICAL": 2,
        # First time seen
        "ERROR": 3,
        # System low priority error
        "WARNING": 4,
        # Successful/Authorized events
        "NOTICE": 5,
        # System low priority notification
        "INFO": 6,
        # - None -
        "DEBUG": 7,
    }

    @classmethod
    def apply(cls, incidents, debug_ctx):
        """
        Yield a fixed-up copy (with lowercased keys) of every usable
        incident.

        An incident without "msg" is kept only if its raw "message" text
        still contains a "[msg " token; it is then marked as "noshow".
        Incidents that have neither are dropped.

        :param incidents: iterable of raw incident dicts
        :param debug_ctx: accepted for the callers' sake, not used here
        """
        for incident in incidents:
            incident = cls._fixup_camel_case(incident)
            try:
                cls._fixup_msg_inplace(incident)
                cls._fixup_host_tag_inplace(incident)
                cls._fixup_severity_inplace(incident)
                cls._fixup_useragent_inplace(incident)
            except cls.InvalidIncident:
                # "message" may be missing or None here: treat it as empty
                # text instead of failing with KeyError/TypeError
                raw_message = incident.get("message") or ""
                if "[msg " not in raw_message:
                    continue
                cls._fixup_unparsed_message(incident)
            yield incident

    @classmethod
    def _fixup_unparsed_message(cls, incident):
        """
        Salvage an incident whose "msg" was not parsed: tag it "noshow",
        give it severity 7 and keep the text following the first "[msg".
        """
        incident["tag"] = ["noshow"]
        incident["severity"] = 7
        incident["message"] = incident["message"].partition("[msg")[2]

    @classmethod
    def _fixup_camel_case(cls, incident):
        """
        make "Host" be in the same case as "msg", "rule", etc.

        :return dict: new dict with every key lowercased
        """
        return {k.lower(): v for k, v in incident.items()}

    @classmethod
    def _fixup_msg_inplace(cls, incident):
        """
        Move "msg" to "message" and derive "name" from it.

        :raise InvalidIncident: if "msg" is missing or None
        """
        if incident.get("msg") is None:
            raise cls.InvalidIncident()
        # to avoid KeyError: 'message' in sensor_incident_aggregate.py
        msg = incident.pop("msg")
        incident["message"] = msg
        # Try to extract 'constant' part as name
        incident["name"] = msg.split("||", maxsplit=1)[0]

    @classmethod
    def _fixup_useragent_inplace(cls, incident):
        """
        rename 'user-agent' to 'user_agent'
        """
        if "user-agent" in incident:
            incident["user_agent"] = incident.pop("user-agent")

    @classmethod
    def _fixup_host_tag_inplace(cls, incident):
        """
        remove "Host: %hostname%" field duplicate in tag
        """
        if "tag" not in incident:
            return
        # value of the first "Host" header in the advanced section, if any
        host = next(
            (
                value
                for header, value in incident.get(ADVANCED, {}).get(
                    HEADERS, []
                )
                if header == "Host"
            ),
            None,
        )
        if host:
            duplicate = "Host: %s" % host
            incident["tag"] = [
                tag for tag in incident["tag"] if tag != duplicate
            ]

    @classmethod
    def _fixup_severity_inplace(cls, incident):
        """
        map modsec severity string <-> ossec modsec severity int,
        incident table expects int for severity and
        also consistent severity level is good for ML.

        A missing severity is left untouched; an unknown literal is
        logged as an error and left as is.

        github.com/SpiderLabs/ModSecurity/wiki/Reference-Manual#severity
        """
        severity = incident.get("severity")
        if severity is None:
            # this will be OK with imunify360.db
            return
        try:
            incident["severity"] = int(severity)  # for modsec 3.x
            return
        except ValueError:
            pass
        try:
            incident["severity"] = cls._SEVERITY_LEVELS[severity]
        except KeyError:
            logger.error(
                "Cannot measure severity level for %s literal in %s plugin",
                repr(severity),
                repr(ModsecSensor.PLUGIN_ID),
            )


def mk_modsec_indicent_list(top_level_tokens, debug_ctx=None):
    """
    Expand parsed audit log tokens into a list of incidents, one per entry
    of top_level_tokens["MessageList"].

    :param top_level_tokens: see how unit test describe this data structure
    :param debug_ctx: to be shown in sentry incident
    :return list: incidents after _IncidentFixupList.apply(); empty when
        there is no "MessageList"
    """

    def _raw_incident(message_tokens, user_id):
        incident = {
            "method": "INCIDENT",
            "plugin_id": ModsecSensor.PLUGIN_ID,
        }

        # PICK_SECAUDITLOG_FIELDS are looked up
        # first in "Message: [name value] [name value]" tokens
        # then in "Host: %name%", "User-Agent: %ua" top level fields
        for field in PICK_SECAUDITLOG_FIELDS:
            if field in message_tokens or field in top_level_tokens:
                incident[field] = message_tokens.get(
                    field
                ) or top_level_tokens.get(field)

        # generate `advanced` section
        advanced = {HEADERS: top_level_tokens.get(HEADERS, [])}
        for field in (URI, HTTP_METHOD):
            if top_level_tokens.get(field):
                advanced[field] = top_level_tokens[field]
        incident[ADVANCED] = advanced

        incident[USER_IDENTITY_FIELD] = user_id
        return incident

    raw_incidents = []
    if "MessageList" in top_level_tokens:
        # computed once and shared by all incidents of this record
        user_id = user_identity(
            top_level_tokens.get(ATTACKERS_IP, ""),
            dict(top_level_tokens.get(HEADERS, [])),
        )
        for message_enclosing_tokens in top_level_tokens["MessageList"]:
            raw_incidents.append(
                _raw_incident(message_enclosing_tokens, user_id)
            )

    return list(_IncidentFixupList.apply(raw_incidents, debug_ctx))


def get_anomality_score_items(msg: str) -> dict:
    """
    Collect the inbound/outbound anomaly scores mentioned in a message.

    :param msg: text searched with _PARSE_SCORE_REGEX
    :return: "<inbound|outbound>_anomality_score" -> score as matched by
        the regex (a string of digits); a later match for the same key
        replaces an earlier one
    """
    scores = {}
    for found in _PARSE_SCORE_REGEX.finditer(msg):
        direction, points = found.groups()
        key = "%s_anomality_score" % direction.lower()
        assert key in ANOMALITY_SCORE_FIELDS, (
            "invalid anomality score key %s detected" % key
        )
        scores[key] = points
    return scores


def obfuscate_if_sensitive_and_required(
    name: str, value: str
) -> Tuple[Optional[str], bool]:
    """
    Obfuscate the header value if the header is sensitive and tell
    whether the header is required.

    :return: (value as returned by obfuscate_item_if_sensitive(), flag);
        the flag is True when the value came back unchanged, otherwise it
        is ModsecSensor.SEND_ADDITIONAL_DATA
    """
    processed = obfuscate_item_if_sensitive(name, value)
    if processed == value:
        # nothing was obfuscated
        return (processed, True)
    return (processed, ModsecSensor.SEND_ADDITIONAL_DATA)


def obfuscate_item_if_sensitive(name, value) -> Optional[str]:
    """
    Obfuscate the value of a sensitive header so as not to disclose
    client info; the header name is compared case-insensitively.

    The cookie header goes through obfuscate_cookie(), the other names of
    _SENSITIVE_HEADERS through obfuscate_item(); any other header value is
    returned as is.
    """
    header = name.lower()
    if header == _COOKIE:
        return obfuscate_cookie(value)
    if header in _SENSITIVE_HEADERS:
        return obfuscate_item(value)
    return value


def obfuscate_cookie(cookie):
    """
    Obfuscate every value of a cookie header.

    :param cookie: cookie header value, parsed with SimpleCookie
    :return: list of [name, obfuscated value] pairs ordered by cookie
        name, or the CookieError text if the cookie cannot be parsed
    """
    try:
        jar = SimpleCookie(cookie)
    except CookieError as err:
        return str(err)

    pairs = []
    for cookie_name in sorted(jar):
        pairs.append([cookie_name, obfuscate_item(jar[cookie_name].value)])
    return pairs


def _obfuscate_items(data: dict):
    """
    Apply obfuscate_item() to every value of a name -> values mapping.

    :return dict: name -> list of obfuscated values; names whose value
        collection is empty are left out
    """
    masked = {}
    for name, values in data.items():
        hidden = [obfuscate_item(value) for value in values]
        if hidden:
            masked[name] = hidden
    return masked


def obfuscate(query):
    """
    Parse a query string with parse_qs() and obfuscate all of its values.

    :return dict: parameter name -> list of obfuscated values
    """
    params = parse_qs(query)
    return _obfuscate_items(params)


def obfuscate_item(item: Optional[str]) -> Optional[str]:
    """
    Replace the content of a value with placeholders describing its shape.

    Space, tab, LF, CR and NUL become "[space]", "[tab]", "[LF]", "[CR]"
    and "[NULL]"; a run of letters becomes "[chr]" (or "[chr]{N}" for N > 1
    letters); a run of numeric characters becomes "[digit]" (or
    "[digit]{N}"); any other character is kept as is.

    The value is not URL-decoded here.

    :return: obfuscated text; a falsy item (None, "") is returned unchanged
    """
    if not item:  # nothing to obfuscate
        return item

    placeholders = {
        0x20: "[space]",
        0x09: "[tab]",
        0x0A: "[LF]",
        0x0D: "[CR]",
        0x00: "[NULL]",
    }
    # (membership test, single placeholder, run placeholder);
    # letters are tried before numeric characters
    run_kinds = (
        (str.isalpha, "[chr]", "[chr]{%d}"),
        (str.isnumeric, "[digit]", "[digit]{%d}"),
    )

    chars = list(item)
    total = len(chars)
    pieces = []
    idx = 0
    while idx < total:
        ch = chars[idx]
        code = ord(ch)
        if code in placeholders:
            pieces.append(placeholders[code])
            idx += 1
            continue

        for belongs, single, repeated in run_kinds:
            if not belongs(ch):
                continue
            # swallow the whole run of characters of the same kind
            run_start = idx
            while idx < total and belongs(chars[idx]):
                idx += 1
            run_len = idx - run_start
            pieces.append(single if run_len <= 1 else repeated % run_len)
            break
        else:
            # neither special, letter nor numeric: keep the character
            pieces.append(ch)
            idx += 1

    return "".join(pieces)


def get_producer_data(line: str) -> tuple:
    """
    Split a "Producer" audit log value into ModSecurity version and vendors.

    Examples of the value:
        ModSecurity for Apache/2.9.0 (http://www.modsecurity.org/); CWAF_Apache.
        ModSecurity for Apache/2.5.5

    :param line: value of the "Producer" field
    :return: (modsec_ver, vendors) where modsec_ver is the first
        "N.N.N" version found before the first ";" (None if there is none)
        and vendors is the list of ";"-separated items that follow it,
        stripped of ".", "|" and spaces (None if the value has no ";")
    """  # noqa: E501
    vendors = None
    modsec_ver = None
    if ";" in line:
        version, *vendors = line.split(";")
        vendors = [vendor.strip(".| ") for vendor in vendors]
    else:
        # ModSecurity for Apache/2.5.5
        version = line

    # every component may have several digits (e.g. 2.9.10, 3.0.12):
    # a single "\d" per component would truncate or miss such versions
    match = re.search(r"\d+\.\d+\.\d+", version)
    if match:
        modsec_ver = match.group()
    return modsec_ver, vendors