HEX

File: //usr/local/modsecurity-crs/regex-assembly/include/charset-specification-no-anchors.ra
##! Please refer to the documentation at
##! https://coreruleset.org/docs/development/regex_assembly/.

##! The expression generated from this file matches a full HTTP
##! `Accept` header by following the specification as far as
##! necessary, while restricting the `charset` parameter
##! to a list of explicitly allowed values.
##! Where possible, the expression matches tokens "loosely",
##! to reduce complexity and the risk of false positives.

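##! As this specification is used by several rules, we now
##! store it as an include file. As the file name indicates, this
##! variant adds no `^`/`$` anchors; a rule that has to match the
##! complete header value needs to supply them itself.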

##! Specifications:
##! https://httpwg.org/specs/rfc7231.html#request.conneg
##! https://httpwg.org/specs/rfc7230.html

##! Helpers
##! Each `define` names a fragment that is substituted wherever `{{name}}`
##! is written later in this file.

##! Characters that may not appear in a token, without the double quote.
##! This is a bare character list (no surrounding brackets) so that it can
##! be embedded in the character classes below; `\x5c` is the backslash.
##! NOTE(review): RFC 7230 section 3.2.6 lists `@` as a delimiter and `!`
##! as a valid token character, whereas this list contains `!` and not `@`.
##! Confirm whether that deviation is intentional.
##!> define non-token-without-dquote-chars (),/:;<=>?![\x5c\]{}

##! Same list with the double quote added in front.
##!> define non-token-chars \"{{non-token-without-dquote-chars}}

##! One character that is none of the non-token characters.
##! Whitespace is not excluded by this class.
##!> define token-chars [^{{non-token-chars}}]

##! Like `token-chars`, but the double quote is accepted as well.
##! Used below for parameter values, which may be quoted.
##!> define token-with-dquote-chars [^{{non-token-without-dquote-chars}}]

##! One half of a media type: either a literal `*` or one or more
##! token characters.
##!> define type-subtype (?:\*|{{token-chars}}+)

##! A media type is `type/subtype`, where each half is matched by
##! `type-subtype`, or a lone `*`.
##! The specification does not allow `*` in place of `*/*` but
##! enough clients use `*` for it to be an issue. Thus, it is
##! explicitly allowed here.
##!> define media-type (?:(?:{{type-subtype}}/{{type-subtype}})|\*)

##! list of allowed charsets
##! Builds the value part of a `charset=` parameter and stores the result
##! under the name `allowed-charsets`, which the main assembly inserts
##! with `##!=> allowed-charsets`.
##!> assemble
  ##! Optional opening double quote. Only this one group is opened here.
  (?:"?
  ##!=>
  ##! The permitted names come from the separate `allowed-charsets`
  ##! include file, which is not part of this file.
  ##!> include allowed-charsets
  ##!=>
  ##! `\b` requires a word boundary after the charset name, followed by an
  ##! optional closing double quote. Two groups are closed although only
  ##! one was opened above: the second `)` pairs with the
  ##! `(?:charset\s*=\s*` group that is open at every place where this
  ##! expression is inserted. Keep both sides in sync when editing.
  \b"?))
  ##!=< allowed-charsets
##!<


##! Main assembly
##! Overall shape: one media type with optional `;`-separated parameters,
##! followed by any number of `,`-separated repetitions of the same.
##! No `^`/`$` anchors are written in this block.
##! Lines separated by `##!=>` are concatenated; consecutive lines without
##! that marker are combined as alternatives (see the regex-assembly
##! documentation referenced at the top of this file).
##!> assemble
  ##! First media range: the media type itself.
  (?:{{media-type}})
  ##!=>
  ##! Opens the parameter group; it is closed and repeated by `)*` below.
  (?:\s*;\s*
  ##!=>
  ##! Opens two groups: the outer one holds both parameter alternatives,
  ##! the inner one is the `charset` alternative. The inner group is closed
  ##! by the extra `)` at the end of the stored `allowed-charsets`
  ##! expression inserted next.
  (?:(?:charset\s*=\s*
  ##!=>
  ##!=> allowed-charsets

  ##! If the first part wasn't a "charset", then
  ##! anything is allowed here that is not "charset".
  ##! Note that this doesn't follow the RFC strictly.
  |(?:
  ##!=>
  ##! Do not match space, otherwise the following would be possible:
  ##! "text/html; charset=invalid"
  ##! `charset` would be matched by `{{token-chars}}`
  ##! Each line below is one alternative: the parameter name shares a
  ##! progressively longer prefix with `charset` and then differs in the
  ##! next character.
  ##! NOTE(review): every alternative requires at least one character after
  ##! the shared prefix, so a name that is a strict prefix of `charset`
  ##! (e.g. `c` or `char`) directly followed by `=` is not matched by this
  ##! branch. Confirm that this is acceptable.
  [^c\s{{non-token-chars}}]{{token-chars}}*
  c[^h{{non-token-chars}}]{{token-chars}}*
  ch[^a{{non-token-chars}}]{{token-chars}}*
  cha[^r{{non-token-chars}}]{{token-chars}}*
  char[^s{{non-token-chars}}]{{token-chars}}*
  chars[^e{{non-token-chars}}]{{token-chars}}*
  charse[^t{{non-token-chars}}]{{token-chars}}*
  ##!=>
  ##! Closes the name group, then matches `=` and a non-empty value (double
  ##! quotes permitted), then closes the outer alternation group.
  )\s*=\s*{{token-with-dquote-chars}}+)
  ##!=>
  ##! Clients like to violate the RFC, be lenient with
  ##! terminating semi-colons.
  ;?
  ##!=>
  ##! Closes the parameter group; zero or more parameters are accepted.
  )*
  ##!=>

  ##! Multiple "media-range" expressions can be
  ##! specified, comma separated.
  ##! Opens the comma group; it is closed and repeated by the final `)*`.
  (?:\s*,\s*
  ##!=>
  (?:{{media-type}})
  ##!=>
  ##! From here on the parameter grammar repeats the one used for the
  ##! first media range above; keep the two copies identical when editing.
  (?:\s*;\s*
  ##!=>
  (?:(?:charset\s*=\s*
  ##!=>
  ##!=> allowed-charsets
  |(?:
  ##!=>
  ##! Do not match space, otherwise the following would be possible:
  ##! "text/html; charset=invalid"
  ##! `charset` would be matched by `{{token-chars}}`
  [^c\s{{non-token-chars}}]{{token-chars}}*
  c[^h{{non-token-chars}}]{{token-chars}}*
  ch[^a{{non-token-chars}}]{{token-chars}}*
  cha[^r{{non-token-chars}}]{{token-chars}}*
  char[^s{{non-token-chars}}]{{token-chars}}*
  chars[^e{{non-token-chars}}]{{token-chars}}*
  charse[^t{{non-token-chars}}]{{token-chars}}*
  ##!=>
  )\s*=\s*{{token-with-dquote-chars}}+)
  ##!=>
  ##! Clients like to violate the RFC, be lenient with
  ##! terminating semi-colons.
  ;?
  ##!=>
  ##! First `)*` closes the parameter group, second `)*` closes the comma
  ##! group; both may repeat zero or more times.
  )*)*
  ##!=>
##!<