/* $Id$ */

/**
 * \file lexer.c
 * bogofilter's lexical analyzer (control routines)
 *
 * \date 2003-01-01 split out of lexer.l
 */

#include "common.h"

#include <ctype.h>
#include <stdlib.h>
#include <assert.h>
#include <iconv.h>
#include <limits.h>
#include <string.h>
#include <sys/types.h>
#include "buff.h"
#include "debug.h"

#include "base64.h"
#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#ifndef	DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "lexer.h"
#include "memstr.h"
#include "mime.h"
#include "msgcounts.h"
#include "qp.h"
#include "textblock.h"
#include "token.h"
#include "word.h"
#include "xmalloc.h"

/* Global Variables */

/* Scanner state defined outside this file (presumably by the
 * flex-generated scanner this file was split out of). */
extern int yylineno;
extern int yyleng;
extern char *yytext;

/* msg_header: while true, this file treats input as message header text:
 * lines are unfolded and RFC-2047 decoded, spam-header lines are skipped,
 * and debug output is tagged 'h' instead of 'b'. */
bool msg_header = true;
/* have_body: defined here but not referenced anywhere in this file. */
bool have_body  = false;
/* lexer: the active lexer dispatch table; yyinit() points it at v3_lexer
 * unless msg_count_file is set. */
lexer_t *lexer = NULL;

/* Local Variables */

/* Dispatch table pairing the scanner entry point with its token getter. */
static lexer_t v3_lexer = {
    yylex,
    lexer_v3_get_token
};

/* Dispatch table for message-count input.  Note: despite the "Local
 * Variables" heading this object has external linkage. */
lexer_t msg_count_lexer = {
    read_msg_count_line,
    msg_count_get_token
};

/* Function Prototypes */

static int yy_get_new_line(buff_t *buff);
static int get_decoded_line(buff_t *buff);
static int skip_folded_line(buff_t *buff);

/* Function Definitions */

/**
 * (Re)initialize the lexer subsystem.
 *
 * Calls, in this order, mime_reset(), token_init(), lexer_v3_init(NULL)
 * and init_charset_table(charset_default), then asks get_decoded_line()
 * to release its static conversion buffer (that is what passing NULL to
 * it means).
 */
void lexer_init(void)
{
    mime_reset();
    token_init();
    lexer_v3_init(NULL);
    init_charset_table(charset_default);
    /* NULL = "free and zero the internal temp buffer"; return value is always 0 */
    (void)get_decoded_line(NULL);
}

/**
 * Debug helper: dump one buffer to dbgout on a single line.
 *
 * The line is prefixed with the previous line number, an 'h'/'b' marker
 * (header vs. body, taken from msg_header), the scanner state character
 * and the number of bytes between the buffer's read offset and its end.
 * The buffer contents follow in escaped form, terminated by a newline.
 */
static void lexer_display_buffer(buff_t *buff)
{
    const char section = msg_header ? 'h' : 'b';
    const long pending = (long)(buff->t.leng - buff->read);

    fprintf(dbgout, "*** %2d %c%c %2ld ",
            yylineno - 1, section, yy_get_state(), pending);
    buff_puts_escaped(buff, 0, dbgout);
    fputc('\n', dbgout);
}

/**
 * Report whether a buffer is one uninterrupted run of "word" bytes.
 *
 * Such runs can make the scanner abort with "input buffer overflow,
 * can't enlarge buffer because scanner uses REJECT", so the caller
 * trims them before handing them to the scanner.
 *
 * Scanning stops at the first NUL byte or after \a count bytes,
 * whichever comes first (the NUL check fixed a SIGSEGV seen with
 * msg.1023.6479.txt on 10/23/05).
 *
 * \return false as soon as a control, whitespace or punctuation
 *         character other than '_' is seen; true otherwise (including
 *         for an empty buffer).
 *
 * \bug this function must go, we need to fix the lexer
 */
static bool long_token(byte *buf, uint count)
{
    uint pos = 0;

    while (pos < count && buf[pos] != '\0') {
        const byte ch = buf[pos++];

        /* underscore is punctuation, but counts as part of a word here */
        if (ch == '_')
            continue;
        if (iscntrl(ch) || isspace(ch) || ispunct(ch))
            return false;
    }

    return true;
}

/**
 * Read the next raw line from the current reader into \a buff.
 *
 * Besides reading, this routine
 *  - bumps yylineno when something was read,
 *  - checks lines starting with "--" against the MIME boundaries and, on
 *    a match, drops any unread bytes preceding the line and flags
 *    beginning-of-line for the scanner,
 *  - while in the top-level message header, skips spam_header_name
 *    lines together with their folded continuations.
 *
 * \return the reader's byte count for the line; YY_NULL when the reader
 *         reports EOF without a stream error; whatever
 *         skip_folded_line() returned (possibly EOF) when spam-header
 *         lines were skipped.  Exits with EX_ERROR on a stream error.
 */
static int yy_get_new_line(buff_t *buff)
{
    static size_t spam_hdr_len = 0;	/* cached strlen(spam_header_name) */

    int nread = reader_getline(buff);
    const byte *text = buff->t.u.text;

    if (DEBUG_LEXER(4)) {
        fprintf(dbgout, "*** READER INPUT: %-.*s (buff_read %d) ***\n", max(0, nread - (int)buff->read), buff->t.u.ctext + buff->read, (int)buff->read);
    }

    if (spam_hdr_len == 0)
        spam_hdr_len = strlen(spam_header_name);

    if (nread == EOF) {
        if (fpin != NULL && ferror(fpin)) {
            print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
            exit(EX_ERROR);
        }
        return YY_NULL;
    }

    if (nread > 0)
        yylineno += 1;

    /* The MIME boundary check has to look at the raw input, i.e. before
     * any MIME decoding.  Without it, flex aborts with
     * "fatal flex scanner internal error--end of buffer missed" */
    if (buff->t.leng - buff->read > 2
        && text[buff->read] == '-'
        && text[buff->read + 1] == '-')
    {
        word_t candidate;

        candidate.leng = buff->t.leng - buff->read;
        candidate.u.text = buff->t.u.text + buff->read;

        if (DEBUG_MIME(3)) {
            fputs("*** CHECK BOUNDARY: ", dbgout);
            word_puts_escaped(&candidate, 0, dbgout);
            fputs("***\n", dbgout);
            if (buff->read) {
                word_t leftover;

                fputs("^^^ Unread part before boundary:", dbgout);
                leftover.u.text = buff->t.u.text;
                leftover.leng = buff->read;
                word_puts_escaped(&leftover, 0, dbgout);
                fputs(" ^^^\n", dbgout);
            }
        }

        if (mime_is_boundary(&candidate)) {
            /* discard the unread bytes in front of the boundary line and
             * set beginning-of-line so the scanner's ^ anchor matches */
            buff_shift(buff, 0, buff->read);
            set_bol(1);
        }
    }

    if (DEBUG_LEXER(0))
        lexer_display_buffer(buff);

    /* skip spam_header ("X-Bogosity:") lines */
    for (;;) {
        if (!msg_header || nread == EOF)
            break;
        /* don't skip if inside message/rfc822 */
        if (msg_state->parent != NULL)
            break;
        if (buff->t.leng < spam_hdr_len)
            break;
        if (memcmp(buff->t.u.text, spam_header_name, spam_hdr_len) != 0)
            break;
        nread = skip_folded_line(buff);
    }

    return nread;
}

/**
 * Produce the next logical line for the scanner.
 *
 * Steps, in order:
 *  1. read a raw line (yy_get_new_line), into \a buff directly or, in
 *     Unicode mode with decoding enabled, into a static staging buffer;
 *  2. in headers, append folded continuation lines (next line starts
 *     with a blank) so the whole header is processed as one unit;
 *  3. in headers, run the RFC-2047 decoder over the text;
 *  4. in bodies with a known MIME type, undo the transfer encoding;
 *  5. in Unicode mode, convert the staging buffer into \a buff;
 *  6. turn a trailing CRLF into LF;
 *  7. if the scanner's current match is already very long, plant a blank
 *     so the match is forced to end.
 *
 * RFC2047.2
 *   encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
 *   charset = token    ; see section 3
 *   encoding = token   ; see section 4
 *   token = 1*<Any CHAR except SPACE, CTLs, and especials>
 *   especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" /
 *               <"> / "/" / "[" / "]" / "?" / "." / "="
 *   encoded-text = 1*<Any printable ASCII character other than "?"
 *                  or SPACE>
 *                  ; (but see "Use of encoded-words in message
 *                  ; headers", section 5)
 *
 * \return  > 0  number of bytes made available;
 *          0    (YY_NULL) no more input, or the internal buffer was
 *               released because \a buff was NULL;
 *          -2   input was consumed but conversion produced no output
 *               yet -- the caller must call again rather than treat
 *               this as end of input.
 *          Does not return on a stream error (exits with EX_ERROR).
 */
static int get_decoded_line(buff_t *buff /**< output buffer for a MIME-decoded and UNICODE-transformed line; if NULL, releases internal buffer */)
{
    static buff_t tempbuff;
    int count;
    int c;
    buff_t *linebuff;

    /* special case if buff == NULL: clear tempbuff */
    if (NULL == buff) {
        xfree(tempbuff.t.u.text);
        memset(&tempbuff, 0, sizeof(tempbuff));
        return 0;
    }

    /* since msg_state might change during calls */
    bool mime_dont_decode = msg_state->mime_dont_decode;

#ifdef	DISABLE_UNICODE
    linebuff = buff;
#else
    if (encoding != E_UNICODE ||
        msg_state->mime_dont_decode) {
        linebuff = buff;
    } else {
        /* move remainder of buffer to the beginning */
        if (tempbuff.t.leng && tempbuff.t.u.text)
            buff_shift(&tempbuff, 0, tempbuff.read);

        /* UTF-8 uses up to four octets per character.
           Size temp buffer sufficiently */
        if (tempbuff.size < buff->size / 4 || !tempbuff.t.u.text) {
            tempbuff.size = buff->size / 4 + 16; /* add some spare room */
            tempbuff.t.u.text = (byte *)xrealloc(tempbuff.t.u.text, tempbuff.size + D);
        }

        linebuff = &tempbuff;
    }
#endif

    /* note that this call might invoke got_mimeboundary() thus
     * changing the global msg_state variable */
    count = yy_get_new_line(linebuff);

    if (count == EOF) {
        /* fpin may be NULL here; same guard as in yy_get_new_line() */
        if (fpin == NULL || !ferror(fpin))
            return YY_NULL;

        print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
        exit(EX_ERROR);
    }

    if (passthrough && count > 0)
        textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count);

    if (msg_header) {
        int oread = linebuff->read;

        do {
            int add;

            /* without a stream there is nothing to peek at */
            if (fpin == NULL)
                break;

            /* in headers, peek at the next character to see if we need to fetch another
             * line to unfold headers */
            c = getc(fpin);
            if (c == EOF) break;
            ungetc(c, fpin);
            if (isblank(c)) {
                add = yy_get_new_line(linebuff);
                if (add <= 0)
                    break;
                if (passthrough)
                    textblock_add(linebuff->t.u.text+linebuff->read, (size_t) add);
                count += add;
            }
        } while (isblank(c));

        /* the unfolded header starts where the first physical line started */
        linebuff->read = oread;
        if (DEBUG_LEXER(3)) {
            fputs(">>> resulting unfolded header >>> ", dbgout);
            buff_puts_escaped(linebuff, 0, dbgout);
            fputs("\n", dbgout);
        }
    }

    if (msg_header) {
        /* Try RFC-2047 decoder on everything */
        word_t temp, *res;
        uint declen;

        temp.leng = (uint)count;
        temp.u.text = linebuff->t.u.text+linebuff->read;

        if (DEBUG_LEXER(2)) {
            fprintf(dbgout, "before: ");
            lexer_display_buffer(linebuff);
        }
        res = text_decode(&temp);
        declen = res->leng;
        if (res != &temp) {
            /* The decoder returned its own buffer.  After charset
             * conversion the text may be longer than the encoded input,
             * so never copy back more than linebuff can hold. */
            uint room = (linebuff->size > linebuff->read)
                        ? linebuff->size - linebuff->read : 0;
            if (declen > room)
                declen = room;
            memcpy(linebuff->t.u.text+linebuff->read,
                   res->u.text, declen);
        }
        /* unsigned arithmetic: works for shrinking and for growing text */
        linebuff->t.leng -= (uint)count - declen;
        count = (int)declen;
        if (DEBUG_LEXER(2)) {
            fprintf(dbgout, "after:  ");
            lexer_display_buffer(linebuff);
        }
    }

    if ( !msg_header &&
         !mime_dont_decode &&
         msg_state->mime_type != MIME_TYPE_UNKNOWN)
    {
        word_t temp;
        uint decoded_count;

        temp.leng = (uint) count;
        temp.u.text = linebuff->t.u.text+linebuff->read;

        decoded_count = mime_decode(&temp);
        /*change buffer size only if the decoding worked */
        if (decoded_count != 0 && decoded_count < (uint) count) {
            linebuff->t.leng -= (uint) (count - decoded_count);
            count = (int) decoded_count;
            if (DEBUG_LEXER(1))
                lexer_display_buffer(linebuff);
        }
    }

#ifndef	DISABLE_UNICODE
    if (encoding == E_UNICODE &&
        !mime_dont_decode &&
        count > 0)
    {
        if (DEBUG_ICONV(3)) {
            fprintf(dbgout,  "### iconvert buffers: linebuff %u, read %u, buff %u\n", linebuff->size, linebuff->read, buff->size);
            lexer_display_buffer(linebuff);
        }
        /* reset read counter to include bytes leftover from previous
         * call that failed to convert */
        linebuff->read = 0;
        if (DEBUG_ICONV(3))
            lexer_display_buffer(linebuff);
        if (msg_header) {
            /* in headers, the RFC-2047 decoder has already converted,
             * don't do it again. -1 means identity "conversion". */
            iconvert_cd((iconv_t)-1, linebuff, buff);
        } else {
            iconvert(linebuff, /* destination */ buff);
        }
        if (DEBUG_ICONV(3)) {
            fprintf(dbgout,  "### iconvert outputs: linebuff %u, read %u, buff %u, buff->read %u, buff->t.leng %u\n",
                    linebuff->size, linebuff->read, buff->size, buff->read, buff->t.leng);
            lexer_display_buffer(linebuff);
            lexer_display_buffer(buff);
        }

        /* If we return count = 0 here, the caller will think we have
         * no more bytes left to read, even though before the iconvert
         * call we had a positive number of bytes. This *will* lead to
         * a message truncation which we try to avoid by simply
         * returning another in-band error code. */
        if (buff->t.leng == 0) {
            count = -2;
        } else {
            /* iconvert, treating multi-byte sequences, can shrink or enlarge
             * the output compared to its input.  Correct count. */
            count = buff->t.leng;
        }
    }
#endif

#ifdef EXCESSIVE_DEBUG
    /* debug */
    fprintf(dbgout, "%d: ", count);
    buff_puts(buff, 0, dbgout);
    fprintf(dbgout, "\n");
#endif

    /* CRLF -> NL.  The line just delivered ends at buff->t.leng, which is
     * not necessarily offset 'count' (the buffer can still hold earlier
     * bytes in front of it), so inspect the real tail of the data. */
    if (count >= 2 && buff->t.leng >= 2) {
        byte *tail = buff->t.u.text + buff->t.leng - 2;
        if (memcmp(tail, CRLF, 2) == 0) {
            tail[0] = (byte) '\n';
            --buff->t.leng;
            --count;
        }
    }

    if (yyleng + max(0, count) > 1000) {
        /* token growing too long and we're going to discard it anyhow.
           Add a blank character, or replace at the end of a string. */
        if (buff->t.leng < buff->size) {
            buff->t.u.text[buff->t.leng] = ' ';
            ++buff->t.leng;
            if (count > 0) ++count;
        } else if (buff->t.leng > 0) {
            buff->t.u.text[buff->t.leng - 1] = ' ';
        }
        if (DEBUG_LEXER(2)) {
            fprintf(dbgout, "forced aborting match by replacing blank at pos %d\n",
                    (int)yyleng - 1);
        }
    }

    if (buff->t.leng < buff->size)     /* for easier debugging - removable */
        Z(buff->t.u.text[buff->t.leng]);  /* for easier debugging - removable */

    if (DEBUG_LEXER(3)) {
        fprintf(dbgout,"---get_decoded_line returning --> %d (current token length %d)\n", count, (int)yyleng);
    }
    return count;
}

/**
 * Discard the current header line together with its folded continuation
 * lines and load the first line that follows them into \a buff.
 *
 * Each iteration empties \a buff and reads one line.  Reading stops at
 * the first line that does not start with SP or HTAB, at an end-of-line
 * only line (end of header), or when the reader delivers nothing.
 *
 * \return the reader's result for the line now held in \a buff, or the
 *         reader's non-positive result (0 / EOF) when input ran out.
 */
static int skip_folded_line(buff_t *buff)
{
    for (;;) {
        int count;

        buff->t.leng = 0;
        count = reader_getline(buff);

        /* Nothing was read: the buffer still holds the previous line, so
         * it must not be inspected -- a stale leading blank would keep
         * this loop spinning forever at end of input. */
        if (count <= 0)
            return count;

        yylineno += 1;

        /* only check for LWSP-char (RFC-822) aka. WSP (RFC-2822),
         * these only include SP and HTAB */
        if (buff->t.u.text[0] != ' ' &&
            buff->t.u.text[0] != '\t')
            return count;

        /* Check for empty line which terminates message header */
        if (is_eol((char *)buff->t.u.text, count))
            return count;
    }
}

/**
 * Prepare for scanning a new input: restart line counting and select the
 * v3 lexer, unless msg_count_file is set, in which case the current
 * lexer selection is left alone.
 */
void yyinit(void)
{
    yylineno = 0;

    if (msg_count_file)
        return;

    lexer = &v3_lexer;
}

/**
 * Input getter for the scanner: fill \a buf (capacity \a size) with the
 * next chunk of decoded text.
 *
 * Lines are fetched until one yields data that is not a "long token".
 * A buffer holding at least MAX_TOKEN_LEN * 2 bytes with no delimiter in
 * it is trimmed and more text is read, so that a really long run of
 * alphanumerics (which would be ignored anyway) cannot crash the flex
 * scanner.  Results of get_decoded_line() below zero mean "try again".
 *
 * \return number of bytes placed in \a buf; 0 / YY_NULL at end of input.
 */
int yyinput(byte *buf, size_t size)
{
    buff_t line;
    int got;
    int count = 0;

    buff_init(&line, buf, 0, (uint) size);

    for (;;) {
        got = get_decoded_line(&line);
        if (got == 0)
            break;			/* end of input */
        if (got < 0)
            continue;			/* nothing usable yet, read on */

        count = line.t.leng;

        /* Note: some malformed messages can cause xfgetsl() to report
         ** "Invalid buffer size, exiting."  and then abort.  This
         ** can happen when the parser is in html mode and there's a
         ** leading '<' but no closing '>'.
         **
         ** The "fix" is to check for a nearly full lexer buffer and
         ** discard most of it.
         */
        if (count < MAX_TOKEN_LEN * 2 ||
            !long_token(line.t.u.text, (uint) count))
            break;			/* ordinary text: hand it over */

        /* Make sure not to shift bytes outside the buffer */
        if (line.t.leng >= (uint) count) {
            uint start = line.t.leng - count;
            uint length = count - max_token_len;
            buff_shift(&line, start, length);
        }
        count = line.t.leng;
    }

    if (msg_state != NULL &&
        msg_state->mime_dont_decode &&
        msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN) {
        assert(size <= INT_MAX && count <= (int)size);
        return (count == EOF ? 0 : count);   /* not decode at all */
    }

#if	defined(CP866) && !defined(ENABLE_ICONV)
    /* EK -  decoding things like &#1084 and charset_table */
    count = decode_and_htmlUNICODE_to_cp866(buf, count);
#endif

    if (replace_nonascii_characters) {
        /* map every byte through the charset table */
        for (int i = 0; i < count; i++)
            buf[i] = charset_table[buf[i]];
    }

    if (DEBUG_LEXER(2))
        fprintf(dbgout, "*** yyinput(\"%-.*s\", %lu) = %d\n", count, buf, (unsigned long)size, count);

    assert(size <= INT_MAX && count <= (int)size);
    return (count == EOF ? YY_NULL : count);
}

#ifndef DISABLE_UNICODE
/**
 * Copy \a len bytes starting at \a txt into a NUL-terminated string.
 *
 * The string lives in a static buffer that is grown on demand and
 * reused: the returned pointer is only valid until the next call and
 * must not be freed by the caller.
 *
 * \param txt  start of the charset name (need not be NUL-terminated)
 * \param len  number of bytes to copy
 * \return pointer to the NUL-terminated copy
 */
static char *charset_as_string(const byte *txt, const size_t len)
{
    static char *charset_text = NULL;
    /* bytes usable for text, not counting the terminator; kept as size_t
     * so that a large len cannot be truncated into a too-small buffer */
    static size_t charset_room = 0;

    if (charset_text == NULL) {
        charset_text = (char *)xmalloc(len + 1);
        charset_room = len;
    } else if (charset_room < len) {
        charset_text = (char *)xrealloc(charset_text, len + 1);
        charset_room = len;
    }

    memcpy(charset_text, txt, len);
    /* the result is handed out as a C string, so the terminator is not
     * optional */
    charset_text[len] = '\0';

    return charset_text;
}
#endif

/**
 * Decode the RFC-2047 encoded words ("=?charset?B|Q?text?=") found in
 * \a w.
 *
 * Text outside encoded words is kept; whitespace between two adjacent
 * encoded words is dropped.
 *
 * - No "=?" in the input: \a w is returned untouched.
 * - Raw mode (E_RAW): decoding is done in place, \a w is updated to
 *   describe the result and returned.
 * - Unicode mode (E_UNICODE): every decoded word is converted from its
 *   charset and the result is assembled in a static buffer; a pointer
 *   to that buffer's word is returned.  It stays valid until the next
 *   call.  \a w's fields are clobbered in this mode.
 *
 * In every mode the bytes behind \a w are modified (in-place decoding,
 * terminators written for debugging).
 *
 * NOTE: when an encoded word is malformed (missing '?' separators or no
 * closing "?="), decoding stops and the result ends right before that
 * word -- the remainder of the input is not copied.
 *
 * \param w  text to decode
 * \return   \a w, or a pointer into a static buffer (Unicode mode)
 */
word_t *text_decode(word_t *w)
{
    word_t *r = w;
    byte *const beg = w->u.text;		/* base pointer, fixed */
    byte *const fin = beg + w->leng;	/* end+1 position */

    byte *txt = (byte *) memstr(w->u.text, w->leng, "=?");	/* input position */
    uint size;							/* output offset */
    const e_enc enc = encoding; /* this helps the clang-13 static analyzer understand encoding is not changing */

    if (txt == NULL)
        return r;

    /* only now is txt known to point into [beg, fin) */
    size = (uint) (txt - beg);

#ifndef	DISABLE_UNICODE
    size_t outcap = w->leng * 6;	/* room for the converted text */
    static buff_t * buf = NULL;

    if (enc == E_UNICODE) {
        if (buf == NULL)
            buf = buff_new((byte *)xmalloc(outcap+D), 0, outcap);
        r = &buf->t;				/* Use buf to return unicode result */

        buf->t.leng = 0;
        if (buf->size < outcap) {
            buf->size = outcap;
            buf->t.u.text = (byte *) xrealloc(buf->t.u.text, buf->size+D);
        }

        /* start the output with the text preceding the first encoded word */
        buf->t.leng = size;
        memcpy(buf->t.u.text, beg, size );
        Z(buf->t.u.text[buf->t.leng]);		/* for easier debugging - removable */
    }
#endif

    if (DEBUG_LEXER(2)) {
        fputs("**1**  ", dbgout);
        word_puts(w, 0, dbgout);
        fputs("\n", dbgout);
    }

    while (txt < fin) {
        byte *typ, *tmp, *end;
        uint len;
        bool adjacent;

        if (txt + 1 < fin && txt[0] == '=' && txt[1] == '?') {
            txt += 2;
        } else {
            /* Not at an encoded word: keep the rest verbatim and stop.
             * (Every path below leaves txt on "=?" or at fin, so this is
             * a safety net.) */
            len = (uint) (fin - txt);
            if (enc == E_RAW)
                memmove(beg + size, txt, len);
#ifndef DISABLE_UNICODE
            if (enc == E_UNICODE) {
                memcpy(buf->t.u.text + buf->t.leng, txt, len);
                buf->t.leng += len;
            }
#endif
            size += len;
            break;
        }

        /* the charset needs at least one byte in front of its '?' */
        if (txt + 1 >= fin) break;
        typ = (byte *) memchr((char *)txt+1, '?', (size_t) (fin - (txt + 1)));	/* Encoding type - 'B' or 'Q' */
        if (!typ) break;

        typ++;
#ifndef DISABLE_UNICODE
        char *charset;
        charset = charset_as_string(txt, typ - txt - 1);
#endif

        tmp = typ + 2;						/* start of encoded word */
        if (tmp >= fin) break;
        end = (byte *) memstr((char *)tmp, fin-tmp, "?=");	/* last byte of encoded word  */
        if (!end) break;

        len = end - tmp;

        w->u.text = tmp;			/* Start of encoded word */
        w->leng = len;				/* Length of encoded word */
        Z(w->u.text[w->leng]);			/* for easier debugging - removable */

        if (DEBUG_LEXER(2)) {
            fputs("**2**  ", dbgout);
            word_puts(w, 0, dbgout);
            fputs("\n", dbgout);
        }

        switch (tolower(*typ)) {		/* ... encoding type */
        case 'b':
            if (base64_validate(w))
                len = base64_decode(w);		/* decode base64 */
            break;
        case 'q':
            if (qp_validate(w, RFC2047))
                len = qp_decode(w, RFC2047);	/* decode quoted-printable */
            break;
        default:
            /* unknown encoding: the text is passed on undecoded */
            break;
        }

        /* move decoded word to where the encoded used to be */
        if (enc == E_RAW) {
            memmove(beg+size, w->u.text, len);
            size += len;			/* bump output pointer */
            Z(beg[size]);			/* for easier debugging - removable */

            if (DEBUG_LEXER(3))
                fprintf(dbgout, "**3**  %s\n", beg);
        }

#ifndef	DISABLE_UNICODE
        if (enc == E_UNICODE) {
            /* (iconv_t)-1 is what iconvert_cd() is given elsewhere in this
             * file for an identity "conversion"; start from it in case the
             * open below leaves the descriptor unset */
            iconv_t iconv_descriptor = (iconv_t)-1;
            buff_t  src;
            memset(&src, 0, sizeof src);

            /* convert 'word_t *w' to 'buff_t src' because
            ** iconvert_cd() needs buff_t pointers
            */
            src.t.u.text = w->u.text;
            src.t.leng = len;
            src.read   = 0;
            src.size   = len;

            bool haveiconvcd = bf_iconv_open(charset_unicode, charset, &iconv_descriptor);
            (void)haveiconvcd; // the only error situation where this is false is if g_iconv_descriptor were NULL. This can't happen through &.

            iconvert_cd(iconv_descriptor, &src, buf);
            /* closing an invalid descriptor is an error, so skip it */
            if (iconv_descriptor != (iconv_t)-1)
                iconv_close(iconv_descriptor);

            size = buf->t.leng;

            if (DEBUG_LEXER(3)) {
                fputs("**4**  ", dbgout);
                word_puts(&buf->t, 0, dbgout);
                fputs("\n", dbgout);
            }
        }
#endif

        txt = end + 2;	/* skip ?= trailer */
        if (txt >= fin)
            break;

        /* check for next encoded word */
        end = (byte *) memstr((char *)txt, fin-txt, "=?");
        adjacent = end != NULL;

        /* clear adjacent flag if non-whitespace character found between
         * adjacent encoded words */
        if (adjacent) {
            tmp = txt;
            while (adjacent && tmp < end) {
                if (*tmp && strchr(" \t\r\n", *tmp))
                    tmp += 1;
                else
                    adjacent = false;
            }
        }

        /* we have a next encoded word and we've had only whitespace
         * between the current and the next */
        if (adjacent) {
            /* just skip whitespace */
            txt = end;
        } else {
            /* copy everything that was between the encoded words */
            if (!end) end = fin;
            while (txt < end) {
                /* consume the byte first so the loop always advances */
                const byte ch = *txt++;

                if (enc == E_RAW)
                    beg[size++] = ch;
#ifndef	DISABLE_UNICODE
                if (enc == E_UNICODE)
                    buf->t.u.text[buf->t.leng++] = ch;
#endif
            }
        }
    }

    if (enc == E_RAW) {
        /* hand the in-place result back through the caller's word */
        r->u.text = beg;
        r->leng = size;
    }

    return r;
}

/*
 * The following sets edit modes for GNU EMACS
 * Local Variables:
 * mode:c
 * End:
 */
