import { Monologue } from '../../../entities/v1/interview_intelligence/Monologue';
import Tokenizer from '../../../vendor/tokenize-text/index';

// This tokenizer must work exactly like ASR's: https://www.nltk.org/_modules/nltk/tokenize/casual.html
//
// The pattern is an ordered alternation; the first alternative that matches at a position wins:
//   1. an ASCII letter, then one or more letters / `'` / `-` / `_`, then a letter
//      (e.g. "hello", "don't", "well-known"). `\W` is ASCII-based here because the regex has no `u` flag.
//   2. digits, one separator out of `, / . : -`, digits, with an optional sign on either side
//      (e.g. "3.14", "12:30", "1/2").
//   3. a run of word characters (letters, digits, underscore).
//   4. an ellipsis: a dot followed by one or more further dots, whitespace allowed in between.
//   5. any other single non-whitespace character.
//
// NOTE(review): how `re()` walks the text with this pattern is defined by the vendored
// tokenize-text module, not here; confirm there before changing the pattern.
const tokenize = new Tokenizer().re(
  /(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])|(?:[+-]?\d+[,/.:-]\d+[+-]?)|(?:[\w_]+)|(?:\.(?:\s*\.){1,})|(?:\S)/,
);

/**
 * Strips diacritics from a string so accented and unaccented spellings compare equal.
 *
 * The text is first decomposed to Unicode NFD, then every code unit in the
 * Combining Diacritical Marks block (U+0300 - U+036F) is removed. The result is
 * left in decomposed form; it is not recomposed afterwards.
 *
 * @param str - Text to fold.
 * @returns `str` without combining diacritical marks, e.g. `"caf\u00e9"` becomes `"cafe"`.
 */
function normalize(str: string): string {
  return str.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
}

/**
 * Escapes every regular-expression metacharacter in a string so the result can be
 * embedded in a `RegExp` source and matched literally.
 *
 * @param string - Raw text that should be matched verbatim.
 * @returns The text with `. * + ? ^ $ { } ( ) | [ ] \` each prefixed by a backslash.
 */
function escapeRegExp(string: string): string {
  // `$&` re-inserts the matched metacharacter right after the added backslash.
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Finds every case-insensitive occurrence of a search phrase in a text.
 *
 * The phrase is matched literally, except that each space in it also matches any
 * run (possibly empty) of spaces and the punctuation characters `. ? , !`, so
 * "hello world" also finds "hello, world" and "helloworld".
 *
 * @param text - Text to search in.
 * @param findString - Phrase to look for; regex metacharacters are escaped.
 * @returns One `[start, end]` pair per match, where `start` is the index of the first
 *   matched character and `end` is the index of the last matched character (inclusive).
 *   Empty when `findString` is empty or nothing matches.
 */
function findMatches(text: string, findString: string): Array<[number, number]> {
  // An empty phrase would compile to a pattern that matches the empty string everywhere.
  if (findString === '') {
    return [];
  }

  // Collapse runs of spaces first: "a  b" would otherwise yield adjacent
  // `[ .?,!]*[ .?,!]*` quantifiers, which match the same text but backtrack needlessly.
  const pattern = escapeRegExp(findString)
    .replace(/ {2,}/g, ' ')
    .split(' ')
    .join('[ .?,!]*');
  const regex = new RegExp(pattern, 'ig');

  const matches: Array<[number, number]> = [];
  for (let match = regex.exec(text); match !== null; match = regex.exec(text)) {
    if (match[0].length === 0) {
      // A phrase made only of spaces can match zero characters. `exec` does not move
      // `lastIndex` in that case, so step over the position by hand to avoid looping forever.
      regex.lastIndex += 1;
      continue;
    }

    matches.push([match.index, regex.lastIndex - 1]);
  }

  return matches;
}

/**
 * Classifies how a match span overlaps a token span. Both spans are given as
 * `[start, end)` character offsets into the same string.
 *
 * The checks are ordered so that a match lying entirely inside the token is
 * reported as `'SINGLE'` even when it also reaches the token's last character.
 *
 * @returns `'SINGLE'` when the match lies entirely inside the token,
 *   `'END'` when the match begins before the token and stops inside it,
 *   `'MIDDLE'` when the match begins before the token and continues past it,
 *   `'START'` when the match begins inside the token and continues to or past its end,
 *   or `null` when none of these apply.
 */
function classifyOverlap(
  matchStart: number,
  matchEnd: number,
  tokenStart: number,
  tokenEnd: number,
): string | null {
  if (matchStart >= tokenStart && matchEnd <= tokenEnd) {
    return 'SINGLE';
  }
  if (matchStart < tokenStart && matchEnd >= tokenStart && matchEnd <= tokenEnd) {
    return 'END';
  }
  if (matchStart < tokenStart && matchEnd > tokenEnd) {
    return 'MIDDLE';
  }
  if (matchStart >= tokenStart && matchStart <= tokenEnd && matchEnd >= tokenEnd) {
    return 'START';
  }
  return null;
}

/**
 * Computes, token by token, which character ranges of a monologue are covered by
 * occurrences of a search phrase.
 *
 * Both the phrase and `monologue.sentence` are stripped of diacritics before
 * searching, and the stripped sentence is re-tokenized to obtain token positions.
 *
 * @param monologue - Source of the sentence to search (`sentence`) and of the token
 *   count used for the blank-phrase result (`tokens`).
 * @param findString - Phrase to highlight; surrounding whitespace is ignored.
 * @returns An array with one entry per token. Each entry lists the marks for that token
 *   as `{ start, end, type }`, where `start`/`end` are offsets into the token's text
 *   (`end` exclusive) and `type` is `'SINGLE'`, `'START'`, `'MIDDLE'` or `'END'`.
 *   When the phrase is blank, every entry is an empty array.
 */
export function markTokens(monologue: Monologue, findString: string) {
  const trimmed = findString.trim();
  // Stripping diacritics can empty the phrase too (e.g. a lone combining accent),
  // so the blank check has to look at the normalized value.
  const needle = trimmed === '' ? '' : normalize(trimmed);

  if (needle === '') {
    // Build a fresh array per token: `fill([])` would put the *same* array in every slot.
    // NOTE(review): this path sizes the result from `monologue.tokens`, the path below from
    // re-tokenizing the sentence; presumably the two always agree - confirm.
    return Array.from({ length: monologue.tokens.length }, () => []);
  }

  const sentence = normalize(monologue.sentence);
  const matches = findMatches(sentence, needle);
  const tokens = tokenize(sentence);

  const tokenMarks = [];
  for (const token of tokens) {
    const tokenStart = token.index;
    // NOTE(review): assumes `token.offset` is the token's length, making this an exclusive end.
    const tokenEnd = token.index + token.offset;
    const tokenLength = token.value.length;

    const marks = [];
    for (const match of matches) {
      const matchStart = match[0];
      // `findMatches` reports an inclusive end; convert to exclusive for the comparisons.
      const matchEnd = match[1] + 1;

      const type = classifyOverlap(matchStart, matchEnd, tokenStart, tokenEnd);
      if (type === null) {
        continue;
      }

      // Clamp the match to the token and express it relative to the token's own text.
      const start = Math.max(matchStart - tokenStart, 0);
      const end = Math.min(matchEnd - tokenEnd + tokenLength, tokenLength);

      // A match that merely touches the token's edge yields an empty range; drop it.
      if (start !== end) {
        marks.push({ start, end, type });
      }
    }

    tokenMarks.push(marks);
  }

  return tokenMarks;
}
