%top {
/* Include this before everything else, for various large-file definitions */
#include "config.h"
#define WS_LOG_DOMAIN LOG_DOMAIN_DFILTER
#include <wireshark.h>

#include <stdlib.h>
#include <errno.h>

#include <wsutil/str_util.h>

#include "dfilter-int.h"
#include "syntax-tree.h"
#include "grammar.h"
#include "dfunctions.h"
}

/*
 * Always generate warnings.
 */
%option warn

/*
 * We want a reentrant scanner.
 */
%option reentrant

/*
 * We don't use input, so don't generate code for it.
 */
%option noinput

/*
 * We don't use unput, so don't generate code for it.
 */
%option nounput

/*
 * We don't read interactively from the terminal.
 */
%option never-interactive

/*
 * Prefix scanner routines with "df_yy" rather than "yy", so this scanner
 * can coexist with other scanners.
 */
%option prefix="df_yy"

/*
 * We're reading from a string, so we don't need yywrap.
 */
%option noyywrap

/*
 * The type for the dfs we keep for a scanner.
 */
%option extra-type="dfsyntax_t *"

%{
/*
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 2001 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

/*
 * Disable diagnostics in the code generated by Flex.
 */
DIAG_OFF_FLEX()

/*
 * Forward declarations for the helpers defined after the rules section.
 * The set_lval_*() helpers store a new syntax tree node in dfs->lval and
 * return the token code that the rule action hands back to its caller.
 */
WS_WARN_UNUSED static int set_lval_simple(dfsyntax_t *dfs, int token, const char *token_value, sttype_id_t type_id);
/*
 * Shorthands for rule actions: advance the location over the matched text
 * (yytext), then build a node of the given type for it and yield "token".
 */
#define simple(token)	(update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_UNINITIALIZED))
#define test(token)	(update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_TEST))
#define math(token)	(update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_ARITHMETIC))

/* "value" becomes the node data; "token_value" is the text as it was typed. */
WS_WARN_UNUSED static int set_lval_literal(dfsyntax_t *dfs,  const char *value, const char *token_value);
WS_WARN_UNUSED static int set_lval_identifier(dfsyntax_t *dfs,  const char *value, const char *token_value);
WS_WARN_UNUSED static int set_lval_constant(dfsyntax_t *dfs,  const char *value, const char *token_value);
WS_WARN_UNUSED static int set_lval_unparsed(dfsyntax_t *dfs, const char *value, const char *token_value);

WS_WARN_UNUSED static int set_lval_field(dfsyntax_t *dfs, const header_field_info *hfinfo, const char *token_value);
WS_WARN_UNUSED static int set_lval_quoted_string(dfsyntax_t *dfs, GString *quoted_string);
WS_WARN_UNUSED static int set_lval_charconst(dfsyntax_t *dfs, GString *quoted_string);

/* Escape-sequence helpers; each returns FALSE after reporting an error. */
static gboolean append_escaped_char(dfsyntax_t *dfs, GString *str, char c);
static gboolean append_universal_character_name(dfsyntax_t *dfs, GString *str, const char *ucn);
static gboolean parse_charconst(dfsyntax_t *dfs, const char *s, unsigned long *valuep);

/* Location tracking for plain tokens and for pieces of a quoted token. */
static void update_location(dfsyntax_t *dfs, const char *text);
static void update_string_loc(dfsyntax_t *dfs, const char *text);

/*
 * Report a scan error at the current location. Only usable inside rule
 * actions, where yyextra is available; the action still has to return
 * SCAN_FAILED itself.
 */
#define FAIL(...) \
	do { \
		ws_noisy("Scanning failed here."); \
		dfilter_fail(yyextra, DF_ERROR_GENERIC, yyextra->location, __VA_ARGS__); \
	} while (0)

%}

/* A letter or underscore followed by letters, digits or underscores. */
FunctionIdentifier	[[:alpha:]_][[:alnum:]_]*

/*
 * Cannot start with '-'. Some protocol names can contain '-', for example "mac-lte".
 * Note that some protocol names start with a number, for example "9p". This is
 * handled as a special case for numeric patterns.
 * Some protocol names contain dots, e.g: _ws.expert
 * Protocol or protocol field cannot contain DOTDOT anywhere.
 */
VarIdentifier		[[:alpha:]_][[:alnum:]_-]*
ProtoFieldIdentifier	{VarIdentifier}(\.{VarIdentifier})*

/* Six groups of two hex digits separated by ':', '-' or '.'. */
hex2			[[:xdigit:]]{2}
ColonMacAddress		{hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2}
HyphenMacAddress 	{hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2}
DotMacAddress		{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}

/* Three groups of four hex digits separated by '.'. */
hex4			[[:xdigit:]]{4}
DotQuadMacAddress 	{hex4}\.{hex4}\.{hex4}

/*
 * Byte sequences. ColonBytes also accepts a single byte with a trailing
 * ':'; the hyphen and dot forms need at least two bytes.
 */
ColonBytes		({hex2}:)|({hex2}(:{hex2})+)
HyphenBytes		{hex2}(-{hex2})+
DotBytes		{hex2}(\.{hex2})+

/* A decimal number from 0 to 255 without leading zeros. */
DecOctet		[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]
IPv4Address		{DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet}

/*
 * IPv6 address forms: h16 is one to four hex digits, ls32 is the last
 * 32 bits written either as two h16 groups or as an IPv4 address.
 */
h16			[0-9A-Fa-f]{1,4}
ls32			{h16}:{h16}|{IPv4Address}
IPv6Address		({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?::

/* '/' followed by a decimal prefix length; the value itself is not range-checked here. */
V4CidrPrefix		\/[[:digit:]]{1,2}
V6CidrPrefix		\/[[:digit:]]{1,3}

/* Catch all valid semantic values. Cannot contain DOT DOT or start with MINUS. */
StartAlphabet		[[:alnum:]_:]
Alphabet		[[:alnum:]_:/]
LiteralValue		{StartAlphabet}{Alphabet}*(\.{Alphabet}+)*

/* Exponent suffixes for decimal ('e') and hexadecimal ('p') floats. */
Exponent		([eE][+-]?[[:digit:]]+)
HexExponent		([pP][+-]?[[:digit:]]+)

/*
 * Exclusive start conditions: RANGE is entered after '[', LAYER after '#',
 * DQUOTE inside a double-quoted string and SQUOTE inside a character constant.
 */
%x RANGE
%x LAYER
%x DQUOTE
%x SQUOTE

%x RANGE
%x LAYER
%x DQUOTE
%x SQUOTE

%%

[[:blank:]\n\r]+	{
	/* Whitespace yields no token; only the location is advanced. */
	update_location(yyextra, yytext);
}

	/* Punctuation and the "any"/"all" keywords. */
"("		return simple(TOKEN_LPAREN);
")"		return simple(TOKEN_RPAREN);
","		return simple(TOKEN_COMMA);
"{"		return simple(TOKEN_LBRACE);
".."		return simple(TOKEN_DOTDOT);
"}"		return simple(TOKEN_RBRACE);
"$"		return simple(TOKEN_DOLLAR);
"@"		return simple(TOKEN_ATSIGN);
"any"		return simple(TOKEN_ANY);
"all"		return simple(TOKEN_ALL);

	/* Comparison and logical operators; each has a symbolic and a word spelling. */
"=="		return test(TOKEN_TEST_ANY_EQ);
"eq"		return test(TOKEN_TEST_ANY_EQ);
"any_eq"	return test(TOKEN_TEST_ANY_EQ);
"!="		return test(TOKEN_TEST_ALL_NE);
"ne"		return test(TOKEN_TEST_ALL_NE);
"all_ne"	return test(TOKEN_TEST_ALL_NE);
"==="		return test(TOKEN_TEST_ALL_EQ);
"all_eq"	return test(TOKEN_TEST_ALL_EQ);
"!=="		return test(TOKEN_TEST_ANY_NE);
"any_ne"	return test(TOKEN_TEST_ANY_NE);
">"		return test(TOKEN_TEST_GT);
"gt"		return test(TOKEN_TEST_GT);
">="		return test(TOKEN_TEST_GE);
"ge"		return test(TOKEN_TEST_GE);
"<"		return test(TOKEN_TEST_LT);
"lt"		return test(TOKEN_TEST_LT);
"<="		return test(TOKEN_TEST_LE);
"le"		return test(TOKEN_TEST_LE);
"contains"	return test(TOKEN_TEST_CONTAINS);
"~"		return test(TOKEN_TEST_MATCHES);
"matches"	return test(TOKEN_TEST_MATCHES);
"!"		return test(TOKEN_TEST_NOT);
"not"		return test(TOKEN_TEST_NOT);
"&&"		return test(TOKEN_TEST_AND);
"and"		return test(TOKEN_TEST_AND);
"||"		return test(TOKEN_TEST_OR);
"or"		return test(TOKEN_TEST_OR);
"^^"		return test(TOKEN_TEST_XOR);
"xor"		return test(TOKEN_TEST_XOR);
"in"		return test(TOKEN_TEST_IN);

	/* Arithmetic operators. */
"+"		return math(TOKEN_PLUS);
"-"		return math(TOKEN_MINUS);
"*"		return math(TOKEN_STAR);
"/"		return math(TOKEN_RSLASH);
"%"		return math(TOKEN_PERCENT);
"&"		return math(TOKEN_BITWISE_AND);
"bitwise_and"	return math(TOKEN_BITWISE_AND);

"#"				{
	/* After '#' only a run of digits or a '[' is accepted (LAYER state). */
	BEGIN(LAYER);
	return simple(TOKEN_HASH);
}

<LAYER>[[:digit:]]+		{
	/* The number following '#'; scanning then resumes in the normal state. */
	BEGIN(INITIAL);
	update_location(yyextra, yytext);
	return set_lval_simple(yyextra, TOKEN_INTEGER, yytext, STTYPE_UNINITIALIZED);
}

<LAYER>[^[:digit:][]			{
	/* Any single character other than a digit or '[' is an error here. */
	update_location(yyextra, yytext);
	FAIL("Expected digit or \"[\", not \"%s\"", yytext);
	return SCAN_FAILED;
}

<INITIAL,LAYER>"["		{
	/* Start of a slice; its contents are scanned in the RANGE state. */
	BEGIN(RANGE);
	return simple(TOKEN_LBRACKET);
}

<RANGE>[^],]+			{
	/*
	 * Everything up to the next ',' or ']' is passed on verbatim as one
	 * range node; it is not interpreted by the scanner.
	 */
	update_location(yyextra, yytext);
	return set_lval_simple(yyextra, TOKEN_RANGE_NODE, yytext, STTYPE_UNINITIALIZED);
}

<RANGE>","			{
	return simple(TOKEN_COMMA);
}

<RANGE>"]"				{
	/* End of the slice; back to the normal state. */
	BEGIN(INITIAL);
	return simple(TOKEN_RBRACKET);
}

<RANGE><<EOF>>				{
	/* Input ended before the closing ']'. */
	update_location(yyextra, yytext);
	FAIL("The right bracket was missing from a slice.");
	return SCAN_FAILED;
}

[rR]{0,1}\042			{
	/* start quote of a quoted string */
	/*
	 * The example of how to scan for strings was taken from
	 * the flex manual, from the section "Start Conditions".
	 * See: https://westes.github.io/flex/manual/Start-Conditions.html
	 */
	BEGIN(DQUOTE);
	update_location(yyextra, yytext);
	/* The string's location starts at the opening quote (or r/R prefix). */
	yyextra->string_loc = yyextra->location;

	/* The string contents are accumulated here by the DQUOTE rules. */
	yyextra->quoted_string = g_string_new(NULL);

	if (yytext[0] == 'r' || yytext[0] == 'R') {
		/*
		 * This is a raw string (like in Python). Rules: 1) The two
		 * escape sequences are \\ and \". 2) Backslashes are
		 * preserved. 3) Double quotes in the string must be escaped.
		 * Corollary: Strings cannot end with an odd number of
		 * backslashes.
		 * Example: r"a\b\x12\"\\" is the string (including the implicit NUL terminator)
		 * {'a', '\\', 'b', '\\', 'x', '1', '2', '\\', '"', '\\', '\\', '\0'}
		 */
		yyextra->raw_string = TRUE;
	}
	else {
		yyextra->raw_string = FALSE;
	}
}

<DQUOTE><<EOF>>				{
	/* unterminated string */
	update_string_loc(yyextra, yytext);
	/* Discard the partial string; no node will take it over. */
	g_string_free(yyextra->quoted_string, TRUE);
	yyextra->quoted_string = NULL;
	FAIL("The final quote was missing from a quoted string.");
	return SCAN_FAILED;
}

<DQUOTE>\042			{
	/* end quote */
	BEGIN(INITIAL);
	update_string_loc(yyextra, yytext);
	/* The accumulated GString is passed on to the new string node. */
	int token = set_lval_quoted_string(yyextra, yyextra->quoted_string);
	yyextra->quoted_string = NULL;
	yyextra->string_loc.col_start = -1;
	return token;
}

<DQUOTE>\\[0-7]{1,3} {
	/* octal sequence */
	update_string_loc(yyextra, yytext);
	if (yyextra->raw_string) {
		/* Raw strings keep the escape text as written. */
		g_string_append(yyextra->quoted_string, yytext);
	}
	else {
		unsigned long result;
		/* Skip the backslash; three octal digits can exceed one byte. */
		result = strtoul(yytext + 1, NULL, 8);
		if (result > 0xff) {
			g_string_free(yyextra->quoted_string, TRUE);
			yyextra->quoted_string = NULL;
			FAIL("%s is larger than 255.", yytext);
			return SCAN_FAILED;
		}
		g_string_append_c(yyextra->quoted_string, (gchar) result);
	}
}

<DQUOTE>\\x[[:xdigit:]]{1,2} {
	/* hex sequence */
	/*
	 * C standard does not place a limit on the number of hex
	 * digits after \x... but we do. \xNN can have 1 or two Ns, not more.
	 */
	update_string_loc(yyextra, yytext);
	if (yyextra->raw_string) {
		g_string_append(yyextra->quoted_string, yytext);
	}
	else {
		unsigned long result;
		/* Skip the "\x" prefix; two hex digits always fit in one byte. */
		result = strtoul(yytext + 2, NULL, 16);
		g_string_append_c(yyextra->quoted_string, (gchar) result);
	}
}

<DQUOTE>\\u[[:xdigit:]]{0,4} {
	/* universal character name */
	/*
	 * Fewer than four digits are matched too, so that the helper can
	 * reject them with an error message.
	 */
	update_string_loc(yyextra, yytext);
	if (yyextra->raw_string) {
		g_string_append(yyextra->quoted_string, yytext);
	}
	else if (!append_universal_character_name(yyextra, yyextra->quoted_string, yytext)) {
		g_string_free(yyextra->quoted_string, TRUE);
		yyextra->quoted_string = NULL;
		return SCAN_FAILED;
	}
}

<DQUOTE>\\U[[:xdigit:]]{0,8} {
	/* universal character name */
	/* Same as above, for the eight-digit form. */
	update_string_loc(yyextra, yytext);
	if (yyextra->raw_string) {
		g_string_append(yyextra->quoted_string, yytext);
	}
	else if (!append_universal_character_name(yyextra, yyextra->quoted_string, yytext)) {
		g_string_free(yyextra->quoted_string, TRUE);
		yyextra->quoted_string = NULL;
		return SCAN_FAILED;
	}
}


<DQUOTE>\\.				{
	/* escaped character */
	/*
	 * Any other backslash pair. The more specific escape rules above
	 * win when they match a longer or equally long text.
	 */
	update_string_loc(yyextra, yytext);
	if (yyextra->raw_string) {
		g_string_append(yyextra->quoted_string, yytext);
	}
	else if (!append_escaped_char(yyextra, yyextra->quoted_string, yytext[1])) {
		g_string_free(yyextra->quoted_string, TRUE);
		yyextra->quoted_string = NULL;
		return SCAN_FAILED;
	}
}

<DQUOTE>[^\\\042]+			{
	/* non-escaped string */
	/* A run of characters containing neither a backslash nor a double quote. */
	update_string_loc(yyextra, yytext);
	g_string_append(yyextra->quoted_string, yytext);
}


\047				{
	/* start quote of a quoted character value */
	BEGIN(SQUOTE);
	update_location(yyextra, yytext);
	yyextra->string_loc = yyextra->location;

	/*
	 * Unlike double-quoted strings, the quotes are kept in the buffer:
	 * parse_charconst() expects the constant with its enclosing quotes.
	 */
	yyextra->quoted_string = g_string_new("'");
}

<SQUOTE><<EOF>>				{
	/* unterminated character value */
	update_string_loc(yyextra, yytext);
	g_string_free(yyextra->quoted_string, TRUE);
	yyextra->quoted_string = NULL;
	FAIL("The final quote was missing from a character constant.");
	return SCAN_FAILED;
}

<SQUOTE>\047			{
	/* end quote */
	BEGIN(INITIAL);
	update_string_loc(yyextra, yytext);
	g_string_append_c(yyextra->quoted_string, '\'');
	/* The buffer is consumed by set_lval_charconst(), also on failure. */
	int token = set_lval_charconst(yyextra, yyextra->quoted_string);
	yyextra->quoted_string = NULL;
	yyextra->string_loc.col_start = -1;
	return token;
}

<SQUOTE>\\.				{
	/* escaped character */
	/* Kept verbatim; escapes are decoded once the closing quote is seen. */
	update_string_loc(yyextra, yytext);
	g_string_append(yyextra->quoted_string, yytext);
}

<SQUOTE>[^\\\047]+			{
	/* non-escaped string */
	update_string_loc(yyextra, yytext);
	g_string_append(yyextra->quoted_string, yytext);
}

	/* NOTE: None of the patterns below can match ".." anywhere in the token string. */

	/* MAC address. */

{ColonMacAddress}|{HyphenMacAddress}	{
	/* MAC Address. */
	/* Always a literal; no field lookup is attempted for these forms. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

{DotMacAddress}|{DotQuadMacAddress}	{
	/* MAC Address, can also be a field. */
	/* set_lval_unparsed() tries a field lookup first and falls back to a literal. */
	update_location(yyextra, yytext);
	return set_lval_unparsed(yyextra, yytext, yytext);
}

	/* IP address. */

{IPv4Address}{V4CidrPrefix}?		{
	/* IPv4 with or without prefix. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

{IPv6Address}{V6CidrPrefix}?		{
	/* IPv6 with or without prefix. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

	/* Integer */

[[:digit:]][[:digit:]]*	{
	/* Numeric or field. */
	update_location(yyextra, yytext);
	/* Check if we have a protocol or protocol field, otherwise assume a literal. */
	/* It is only reasonable to assume a literal here, instead of a
	 * (possibly non-existent) protocol field, because protocol field filter names
	 * should not start with a digit (the lexical syntax for numbers). */
	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra, yytext);
	if (hfinfo != NULL) {
		return set_lval_field(yyextra, hfinfo, yytext);
	}
	return set_lval_literal(yyextra, yytext, yytext);
}

0[bBxX]?[[:xdigit:]]+	{
	/* Binary or octal or hexadecimal. */
	/* The digits are not checked against the base here. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

	/* Floating point. */

[[:digit:]]+{Exponent}|[[:digit:]]+\.[[:digit:]]+{Exponent}?	{
	/* Decimal float with optional exponent. */
	/* Significand cannot have any side omitted. */
	update_location(yyextra, yytext);
	/* Check if we have a protocol or protocol field, otherwise assume a literal. */
	/* It is only reasonable to assume a literal here, instead of a
	 * (possibly non-existent) protocol field, because protocol field filter names
	 * should not start with a digit (the lexical syntax for numbers). */
	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra, yytext);
	if (hfinfo != NULL) {
		return set_lval_field(yyextra, hfinfo, yytext);
	}
	return set_lval_literal(yyextra, yytext, yytext);
}

0[xX][[:xdigit:]]+{HexExponent}|0[xX][[:xdigit:]]+\.[[:xdigit:]]+{HexExponent}?	{
	/* Hexadecimal float with optional exponent. Can't be a field because
	 * field cannot begin with 0x. */
	/* Significand cannot have any side omitted. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra,  yytext, yytext);
}

:[[:xdigit:]]+	{
	/* Numeric prefixed with ':'. */
	/* The ':' forces a literal; it is dropped from the value but kept in the token text. */
	update_location(yyextra, yytext);
	return set_lval_literal(yyextra, yytext + 1, yytext);
}

[[:xdigit:]]+	{
	/* Numeric or field. */
	/* Hex digits without a prefix are ambiguous, so a field lookup is tried first. */
	update_location(yyextra, yytext);
	return set_lval_unparsed(yyextra, yytext, yytext);
}

	/* Bytes. */

:?{ColonBytes}	{
	/* Bytes. */
	/* An optional leading ':' is dropped from the value but kept in the token text. */
	update_location(yyextra, yytext);
	if (yytext[0] == ':')
		return set_lval_literal(yyextra, yytext + 1, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

:?{HyphenBytes}	{
	/* Bytes. */
	update_location(yyextra, yytext);
	if (yytext[0] == ':')
		return set_lval_literal(yyextra, yytext + 1, yytext);
	return set_lval_literal(yyextra, yytext, yytext);
}

:?{DotBytes}	{
	/* DotBytes, can be a field without ':' prefix. */
	update_location(yyextra, yytext);
	if (yytext[0] == ':')
		return set_lval_literal(yyextra, yytext + 1, yytext);
	/* Without the prefix, a field lookup is tried before falling back to a literal. */
	return set_lval_unparsed(yyextra, yytext, yytext);
}

	/* Identifier (protocol/field/function name). */

	/* This must come before ProtoFieldIdentifier to match function names. */
{FunctionIdentifier}	{
	/* Identifier (field or function) or constant (bytes without separator). */
	/* We use CONSTANT instead of LITERAL because the difference is significant
	 * in the syntactical grammar. */
	update_location(yyextra, yytext);
	/* A known field name wins, then a known function name. */
	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra, yytext);
	if (hfinfo != NULL) {
		return set_lval_identifier(yyextra, yytext, yytext);
	}
        df_func_def_t *def = df_func_lookup(yytext);
	if (def != NULL) {
		return set_lval_identifier(yyextra, yytext, yytext);
	}
	return set_lval_constant(yyextra, yytext, yytext);
}

\.{ProtoFieldIdentifier}	{
	/* Identifier, prefixed with a '.'. */
	update_location(yyextra, yytext);
	/* The '.' forces a field: the name must resolve or scanning fails. */
	const char *name = yytext + 1;
	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra, name);
	if (hfinfo == NULL) {
		FAIL("\"%s\" is not a valid protocol or protocol field.", name);
		return SCAN_FAILED;
	}
	return set_lval_field(yyextra, hfinfo, yytext);
}

{ProtoFieldIdentifier}	{
	/* Catch-all for protocol values. Can also be a literal. */
	/* No lookup is done here; the name is passed on as an identifier. */
	update_location(yyextra, yytext);
	return set_lval_identifier(yyextra, yytext, yytext);
}

{LiteralValue}	{
	/* Catch-all for semantic values. */
	update_location(yyextra, yytext);
	/* We use literal here because identifiers (using unparsed) should have
	 * matched one of the previous rules. */
	return set_lval_literal(yyextra, yytext, yytext);
}

. {
	/* Default */
	/* Any single character that no other rule accepted is an error. */
	update_location(yyextra, yytext);
	if (isprint_string(yytext))
		FAIL("\"%s\" was unexpected in this context.", yytext);
	else
		FAIL("Non-printable ASCII characters may only appear inside double-quotes.");
	return SCAN_FAILED;
}

%%

/*
 * Turn diagnostics back on, so we check the code that we've written.
 */
DIAG_ON_FLEX()

/*
 * Move the current location past the previously matched text and make it
 * span the "len" characters that were matched now.
 */
static void
_update_location(dfsyntax_t *dfs, size_t len)
{
	long previous_len = (long)dfs->location.col_len;

	dfs->location.col_start += previous_len;
	dfs->location.col_len = len;
}

/*
 * Advance the current location over the NUL-terminated text of the token
 * that was just matched.
 */
static void
update_location(dfsyntax_t *dfs, const char *text)
{
	size_t token_len = strlen(text);

	_update_location(dfs, token_len);
}

/*
 * Account for one more piece of a quoted token: the current location is
 * advanced over the piece, and the location of the whole quoted token
 * grows by the piece's length.
 */
static void
update_string_loc(dfsyntax_t *dfs, const char *text)
{
	const size_t piece_len = strlen(text);

	_update_location(dfs, piece_len);
	dfs->string_loc.col_len += piece_len;
}

/*
 * Store a data-less node of type "type_id" for an operator or punctuation
 * token. The token text is copied. Returns "token" unchanged.
 */
static int
set_lval_simple(dfsyntax_t *dfs, int token, const char *token_value, sttype_id_t type_id)
{
	char *text_copy = g_strdup(token_value);

	dfs->lval = stnode_new(type_id, NULL, text_copy, dfs->location);
	return token;
}

/*
 * Store a literal node whose data is a copy of "value" and whose token
 * text is a copy of "token_value". Returns TOKEN_LITERAL.
 */
static int
set_lval_literal(dfsyntax_t *dfs, const char *value, const char *token_value)
{
	char *data_copy = g_strdup(value);
	char *text_copy = g_strdup(token_value);

	dfs->lval = stnode_new(STTYPE_LITERAL, data_copy, text_copy, dfs->location);
	return TOKEN_LITERAL;
}

/*
 * Same node as set_lval_literal() builds (type STTYPE_LITERAL, copied
 * strings), but reported to the caller as TOKEN_IDENTIFIER.
 */
static int
set_lval_identifier(dfsyntax_t *dfs, const char *value, const char *token_value)
{
	char *data_copy = g_strdup(value);
	char *text_copy = g_strdup(token_value);

	dfs->lval = stnode_new(STTYPE_LITERAL, data_copy, text_copy, dfs->location);
	return TOKEN_IDENTIFIER;
}

/*
 * Same node as set_lval_literal() builds (type STTYPE_LITERAL, copied
 * strings), but reported to the caller as TOKEN_CONSTANT.
 */
static int
set_lval_constant(dfsyntax_t *dfs, const char *value, const char *token_value)
{
	char *data_copy = g_strdup(value);
	char *text_copy = g_strdup(token_value);

	dfs->lval = stnode_new(STTYPE_LITERAL, data_copy, text_copy, dfs->location);
	return TOKEN_CONSTANT;
}

/*
 * Handle text that may name a field or be a plain value. If "value"
 * resolves to a field a field node is stored, otherwise a literal node.
 * Either way the node is flagged STFLAG_UNPARSED. Returns the token code
 * of whichever node was stored.
 */
static int
set_lval_unparsed(dfsyntax_t *dfs, const char *value, const char *token_value)
{
	const header_field_info *hfinfo = dfilter_resolve_unparsed(dfs, value);
	int token = (hfinfo != NULL)
		? set_lval_field(dfs, hfinfo, token_value)
		: set_lval_literal(dfs, value, token_value);

	stnode_set_flags(dfs->lval, STFLAG_UNPARSED);
	return token;
}

/*
 * Store a field node that refers to "hfinfo"; the token text is copied.
 * Returns TOKEN_FIELD.
 */
static int
set_lval_field(dfsyntax_t *dfs, const header_field_info *hfinfo, const char *token_value)
{
	char *text_copy = g_strdup(token_value);

	/* stnode_new() is given a plain pointer, so the const is cast away. */
	dfs->lval = stnode_new(STTYPE_FIELD, (gpointer)hfinfo, text_copy, dfs->location);
	return TOKEN_FIELD;
}

/*
 * Store a string node for a completed double-quoted string. The GString
 * itself is passed to the node as its data; the token text is an escaped
 * rendering of the string contents. The node is located at
 * dfs->string_loc, i.e. the whole quoted token. Returns TOKEN_STRING.
 */
static int
set_lval_quoted_string(dfsyntax_t *dfs, GString *quoted_string)
{
	char *escaped = ws_escape_string_len(NULL, quoted_string->str, quoted_string->len, true);

	dfs->lval = stnode_new(STTYPE_STRING, quoted_string, escaped, dfs->string_loc);
	return TOKEN_STRING;
}

/*
 * Store a character-constant node for a completed single-quoted token.
 * "quoted_string" holds the constant including its quotes and is always
 * consumed: its character data becomes the node's token text on success
 * and is freed on failure. Returns TOKEN_CHARCONST, or SCAN_FAILED when
 * the constant cannot be parsed (parse_charconst() has reported the error).
 */
static int
set_lval_charconst(dfsyntax_t *dfs, GString *quoted_string)
{
	unsigned long number;
	char *token_value = g_string_free(quoted_string, FALSE);

	if (!parse_charconst(dfs, token_value, &number)) {
		g_free(token_value);
		return SCAN_FAILED;
	}

	/* The node gets its own heap copy of the numeric value. */
	gpointer number_copy = g_memdup2(&number, sizeof(number));
	dfs->lval = stnode_new(STTYPE_CHARCONST, number_copy, token_value, dfs->string_loc);
	return TOKEN_CHARCONST;
}

/*
 * Append to "str" the character denoted by the escape sequence backslash
 * followed by "c". Supported are the letters a b f n r t v and the
 * characters backslash, single quote and double quote, which stand for
 * themselves.
 *
 * Returns TRUE on success. For any other character an error is reported
 * at the current location, nothing is appended and FALSE is returned.
 */
static gboolean
append_escaped_char(dfsyntax_t *dfs, GString *str, char c)
{
	char decoded;

	switch (c) {
		case 'a':  decoded = '\a'; break;
		case 'b':  decoded = '\b'; break;
		case 'f':  decoded = '\f'; break;
		case 'n':  decoded = '\n'; break;
		case 'r':  decoded = '\r'; break;
		case 't':  decoded = '\t'; break;
		case 'v':  decoded = '\v'; break;
		case '\\':
		case '\'':
		case '\"':
			/* These escape to themselves. */
			decoded = c;
			break;
		default:
			dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->location,
					"\\%c is not a valid character escape sequence", c);
			return FALSE;
	}

	g_string_append_c(str, decoded);
	return TRUE;
}

/*
 * Parse a universal character name at the start of "str": a backslash,
 * then 'u' and exactly four hex digits, or 'U' and exactly eight.
 *
 * str         text starting at the backslash; may continue after the name.
 * ret_endptr  if not NULL, receives a pointer to the first character after
 *             the last digit of the name on success.
 * valuep      receives the code point on success.
 *
 * Exactly four or eight digits are consumed, never more, so any further
 * hex digits are left for the caller to deal with. Reading stops at the
 * first character that is not a hex digit, which includes the terminating
 * NUL, so a short string is never overrun.
 *
 * Returns FALSE, without reporting an error, if the text is malformed or
 * the code point is not allowed (see below).
 */
static gboolean
parse_universal_character_name(dfsyntax_t *dfs _U_, const char *str, char **ret_endptr, gunichar *valuep)
{
	guint32 val = 0;
	int ndigits;

	if (str[0] != '\\')
		return FALSE;

	if (str[1] == 'u')
		ndigits = 4;
	else if (str[1] == 'U')
		ndigits = 8;
	else
		return FALSE;

	/* Skip the leading "\u" or "\U". Eight hex digits always fit in 32 bits. */
	const char *digits = str + 2;
	for (int i = 0; i < ndigits; i++) {
		int digit = g_ascii_xdigit_value(digits[i]);
		if (digit < 0) {
			return FALSE;
		}
		val = (val << 4) | (guint32)digit;
	}

	/*
	 * Ref: https://en.cppreference.com/w/c/language/escape
	 * Range of universal character names
	 *
	 * If a universal character name corresponds to a code point that is
	 * not 0x24 ($), 0x40 (@), nor 0x60 (`) and less than 0xA0, or a
	 * surrogate code point (the range 0xD800-0xDFFF, inclusive), or
	 * greater than 0x10FFFF, i.e. not a Unicode code point (since C23),
	 * the program is ill-formed. In other words, members of basic source
	 * character set and control characters (in ranges 0x0-0x1F and
	 * 0x7F-0x9F) cannot be expressed in universal character names.
	 */
	if (val < 0xA0 && val != 0x24 && val != 0x40 && val != 0x60)
		return FALSE;
	else if (val >= 0xD800 && val <= 0xDFFF)
		return FALSE;
	else if (val > 0x10FFFF)
		return FALSE;

	*valuep = (gunichar)val;
	if (ret_endptr) {
		/* Like strtoul(), hand back a non-const pointer into the input. */
		*ret_endptr = (char *)(digits + ndigits);
	}
	return TRUE;
}

/*
 * Append to "str" the character named by the universal character name
 * "ucn" (the complete escape text, starting at the backslash), using
 * g_string_append_unichar().
 *
 * Returns TRUE on success. If "ucn" is not a valid universal character
 * name an error is reported at the current location and FALSE is returned.
 */
static gboolean
append_universal_character_name(dfsyntax_t *dfs, GString *str, const char *ucn)
{
	gunichar code_point;

	if (parse_universal_character_name(dfs, ucn, NULL, &code_point)) {
		g_string_append_unichar(str, code_point);
		return TRUE;
	}

	dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->location, "%s is not a valid universal character name", ucn);
	return FALSE;
}

/*
 * Parse a character constant, including its enclosing single quotes, into
 * its numeric value.
 *
 * s       NUL-terminated text of the constant, e.g. 'a' or '\n' with the
 *         quotes included.
 * valuep  receives the value on success.
 *
 * Accepted between the quotes are:
 *   - a single printable ASCII character;
 *   - a simple escape: backslash followed by one of ' " \ a b f n r t v;
 *   - an octal escape with one to three digits and a value of at most 0xFF;
 *   - a hex escape, backslash x followed by one or two hex digits;
 *   - a universal character name (backslash u or U, see
 *     parse_universal_character_name()).
 *
 * Returns TRUE on success. On failure an error is reported at
 * dfs->string_loc and FALSE is returned; *valuep is left untouched.
 */
static gboolean
parse_charconst(dfsyntax_t *dfs, const char *s, unsigned long *valuep)
{
	const char *cp;
	unsigned long value;
	gunichar unival;
	char *endptr;
	int digit;

	cp = s + 1;	/* skip the leading ' */
	if (*cp == '\'') {
		dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "Empty character constant.");
		return FALSE;
	}

	/*
	 * Every branch below must leave cp on the character right after the
	 * part it consumed; the final check expects the closing quote there.
	 */
	if (*cp == '\\') {
		/*
		 * C escape sequence.
		 * An escape sequence is an octal number \NNN,
		 * an hex number \xNN, or one of \' \" \\ \a \b \f \n \r \t \v
		 * that stands for the byte value of the equivalent
		 * C-escape in ASCII encoding.
		 */
		cp++;
		switch (*cp) {

		case '\0':
			dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
			return FALSE;

		case 'a':
			value = '\a';
			cp++;
			break;

		case 'b':
			value = '\b';
			cp++;
			break;

		case 'f':
			value = '\f';
			cp++;
			break;

		case 'n':
			value = '\n';
			cp++;
			break;

		case 'r':
			value = '\r';
			cp++;
			break;

		case 't':
			value = '\t';
			cp++;
			break;

		case 'v':
			value = '\v';
			cp++;
			break;

		case '\'':
			value = '\'';
			cp++;
			break;

		case '\\':
			value = '\\';
			cp++;
			break;

		case '"':
			value = '"';
			cp++;
			break;

		case 'x':
			/* One mandatory hex digit... */
			cp++;
			digit = g_ascii_xdigit_value(*cp);
			if (digit < 0) {
				dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
				return FALSE;
			}
			value = (unsigned long)digit;
			cp++;
			/* ...and an optional second one. */
			if (*cp != '\'') {
				digit = g_ascii_xdigit_value(*cp);
				if (digit < 0) {
					dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
					return FALSE;
				}
				value = (value << 4) | (unsigned long)digit;
				/* Only step over the second digit if there was one. */
				cp++;
			}
			break;

		case 'u':
		case 'U':
			/* s + 1 is the backslash that starts the name. */
			if (!parse_universal_character_name(dfs, s+1, &endptr, &unival)) {
				dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is not a valid universal character name", s);
				return FALSE;
			}
			value = (unsigned long)unival;
			cp = endptr;
			break;

		default:
			/* Octal: one mandatory digit and up to two more. */
			if (*cp < '0' || *cp > '7') {
				dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
				return FALSE;
			}
			value = (unsigned long)(*cp - '0');
			cp++;
			for (int extra = 0; extra < 2 && *cp != '\''; extra++) {
				if (*cp < '0' || *cp > '7') {
					dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
					return FALSE;
				}
				value = (value << 3) | (unsigned long)(*cp - '0');
				cp++;
			}
			/* Three octal digits can encode up to 0777. */
			if (value > 0xFF) {
				dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is too large to be a valid character constant.", s);
				return FALSE;
			}
			break;
		}
	} else {
		/*
		 * Go through unsigned char so that a byte with the high bit
		 * set is not sign-extended where plain char is signed.
		 */
		value = (unsigned char)*cp++;
		if (!g_ascii_isprint(value)) {
			dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "Non-printable value '0x%02lx' in character constant.", value);
			return FALSE;
		}
	}

	/* Only the closing quote may follow, and it must end the text. */
	if ((*cp != '\'') || (*(cp + 1) != '\0')){
		dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is too long to be a valid character constant.", s);
		return FALSE;
	}

	*valuep = value;
	return TRUE;
}
