String Sweep in Search of Substrings Ignoring Accent and Case and For Each Match Perform an Action

Question

String Sweep in Search of Substrings Ignoring Accent and Case and For Each Match Perform an Action

Asked 8 years, 4 months ago

Viewed 487 times

5

I need to search a string for a specific substring and, for each occurrence found, replace what was written with its bold version, exactly as it appears in the list, ignoring accents and case.

Example:

I have the list of items:

Arroz
Batata ao Cheff
Macarrão ao Molho
Salmão

With each letter typed, suggestions from the list appear for the user to select (typeahead); the matching ignores accents and is case-insensitive.

If the user types "ão" or "ao", the following items should be suggested:

Batata **ao** Cheff
Macarr**ão** **ao** Molho
Salm**ão**

The list is returned according to the rules, and the bold text comes from the Highlighter function, but when I type "ao" it does not ignore the accent and puts in bold only what is exactly equal to what was typed. In the case of "Macarrão ao Molho", I get either "Macarr**ão** ao Molho" or "Macarrão **ao** Molho", but I need both occurrences highlighted.

I believe I need to scan the string for this substring, checking all possible accented variants, and for each match replace the text ("ão" and "ao") with its bold version (**ão** and **ao**), but I can't turn this idea into code.

Highlighter:

// Wraps every occurrence of the typed query found in `item` in <strong> tags.
// item is the list text, e.g. "Macarrão ao Molho"
// query is what the user typed
// match is the portion of the text that equals what the user typed
    function Highlighter(item) {
            // Take the value of #ReasonDescription and backslash-escape regex metacharacters
            // (and whitespace) so the typed text is matched literally.
            var query = $("#ReasonDescription").val().replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, '\\$&');
            // 'i' = case-insensitive, 'g' = every occurrence. Accents are NOT folded here,
            // so a query of "ao" does not match "ão".
            // The callback receives the whole match first ($1) and then the single capture
            // group (match); with this pattern both hold the same text.
            return item.replace(new RegExp('(' + query + ')', 'ig'), function ($1, match) {
                return '<strong>' + match + '</strong>';
            })
        },

Matcher

/**
 * Builds an accent-insensitive regular expression out of `value` itself and
 * returns the result of matching `value` against it.
 *
 * @param {string} value - text to turn into a pattern and to match against.
 * @returns {?Array<string>} the array produced by String.prototype.match with
 *     a global regex (all matched substrings), or null when nothing matches.
 */
function Matcher(value) {
    // For each unaccented upper-case letter: a character class listing the
    // letter in both cases plus its accented / stylised Unicode variants.
    var accented = {
        'A': '[Aa\xaa\xc0-\xc5\xe0-\xe5\u0100-\u0105\u01cd\u01ce\u0200-\u0203\u0226\u0227\u1d2c\u1d43\u1e00\u1e01\u1e9a\u1ea0-\u1ea3\u2090\u2100\u2101\u213b\u249c\u24b6\u24d0\u3371-\u3374\u3380-\u3384\u3388\u3389\u33a9-\u33af\u33c2\u33ca\u33df\u33ff\uff21\uff41]',
        'B': '[Bb\u1d2e\u1d47\u1e02-\u1e07\u212c\u249d\u24b7\u24d1\u3374\u3385-\u3387\u33c3\u33c8\u33d4\u33dd\uff22\uff42]',
        'C': '[Cc\xc7\xe7\u0106-\u010d\u1d9c\u2100\u2102\u2103\u2105\u2106\u212d\u216d\u217d\u249e\u24b8\u24d2\u3376\u3388\u3389\u339d\u33a0\u33a4\u33c4-\u33c7\uff23\uff43]',
        'D': '[Dd\u010e\u010f\u01c4-\u01c6\u01f1-\u01f3\u1d30\u1d48\u1e0a-\u1e13\u2145\u2146\u216e\u217e\u249f\u24b9\u24d3\u32cf\u3372\u3377-\u3379\u3397\u33ad-\u33af\u33c5\u33c8\uff24\uff44]',
        'E': '[Ee\xc8-\xcb\xe8-\xeb\u0112-\u011b\u0204-\u0207\u0228\u0229\u1d31\u1d49\u1e18-\u1e1b\u1eb8-\u1ebd\u2091\u2121\u212f\u2130\u2147\u24a0\u24ba\u24d4\u3250\u32cd\u32ce\uff25\uff45]',
        'F': '[Ff\u1da0\u1e1e\u1e1f\u2109\u2131\u213b\u24a1\u24bb\u24d5\u338a-\u338c\u3399\ufb00-\ufb04\uff26\uff46]',
        'G': '[Gg\u011c-\u0123\u01e6\u01e7\u01f4\u01f5\u1d33\u1d4d\u1e20\u1e21\u210a\u24a2\u24bc\u24d6\u32cc\u32cd\u3387\u338d-\u338f\u3393\u33ac\u33c6\u33c9\u33d2\u33ff\uff27\uff47]',
        'H': '[Hh\u0124\u0125\u021e\u021f\u02b0\u1d34\u1e22-\u1e2b\u1e96\u210b-\u210e\u24a3\u24bd\u24d7\u32cc\u3371\u3390-\u3394\u33ca\u33cb\u33d7\uff28\uff48]',
        'I': '[Ii\xcc-\xcf\xec-\xef\u0128-\u0130\u0132\u0133\u01cf\u01d0\u0208-\u020b\u1d35\u1d62\u1e2c\u1e2d\u1ec8-\u1ecb\u2071\u2110\u2111\u2139\u2148\u2160-\u2163\u2165-\u2168\u216a\u216b\u2170-\u2173\u2175-\u2178\u217a\u217b\u24a4\u24be\u24d8\u337a\u33cc\u33d5\ufb01\ufb03\uff29\uff49]',
        'J': '[Jj\u0132-\u0135\u01c7-\u01cc\u01f0\u02b2\u1d36\u2149\u24a5\u24bf\u24d9\u2c7c\uff2a\uff4a]',
        'K': '[Kk\u0136\u0137\u01e8\u01e9\u1d37\u1d4f\u1e30-\u1e35\u212a\u24a6\u24c0\u24da\u3384\u3385\u3389\u338f\u3391\u3398\u339e\u33a2\u33a6\u33aa\u33b8\u33be\u33c0\u33c6\u33cd-\u33cf\uff2b\uff4b]',
        'L': '[Ll\u0139-\u0140\u01c7-\u01c9\u02e1\u1d38\u1e36\u1e37\u1e3a-\u1e3d\u2112\u2113\u2121\u216c\u217c\u24a7\u24c1\u24db\u32cf\u3388\u3389\u33d0-\u33d3\u33d5\u33d6\u33ff\ufb02\ufb04\uff2c\uff4c]',
        'M': '[Mm\u1d39\u1d50\u1e3e-\u1e43\u2120\u2122\u2133\u216f\u217f\u24a8\u24c2\u24dc\u3377-\u3379\u3383\u3386\u338e\u3392\u3396\u3399-\u33a8\u33ab\u33b3\u33b7\u33b9\u33bd\u33bf\u33c1\u33c2\u33ce\u33d0\u33d4-\u33d6\u33d8\u33d9\u33de\u33df\uff2d\uff4d]',
        'N': '[Nn\xd1\xf1\u0143-\u0149\u01ca-\u01cc\u01f8\u01f9\u1d3a\u1e44-\u1e4b\u207f\u2115\u2116\u24a9\u24c3\u24dd\u3381\u338b\u339a\u33b1\u33b5\u33bb\u33cc\u33d1\uff2e\uff4e]',
        'O': '[Oo\xba\xd2-\xd6\xf2-\xf6\u014c-\u0151\u01a0\u01a1\u01d1\u01d2\u01ea\u01eb\u020c-\u020f\u022e\u022f\u1d3c\u1d52\u1ecc-\u1ecf\u2092\u2105\u2116\u2134\u24aa\u24c4\u24de\u3375\u33c7\u33d2\u33d6\uff2f\uff4f]',
        'P': '[Pp\u1d3e\u1d56\u1e54-\u1e57\u2119\u24ab\u24c5\u24df\u3250\u3371\u3376\u3380\u338a\u33a9-\u33ac\u33b0\u33b4\u33ba\u33cb\u33d7-\u33da\uff30\uff50]',
        'Q': '[Qq\u211a\u24ac\u24c6\u24e0\u33c3\uff31\uff51]',
        'R': '[Rr\u0154-\u0159\u0210-\u0213\u02b3\u1d3f\u1d63\u1e58-\u1e5b\u1e5e\u1e5f\u20a8\u211b-\u211d\u24ad\u24c7\u24e1\u32cd\u3374\u33ad-\u33af\u33da\u33db\uff32\uff52]',
        'S': '[Ss\u015a-\u0161\u017f\u0218\u0219\u02e2\u1e60-\u1e63\u20a8\u2101\u2120\u24ae\u24c8\u24e2\u33a7\u33a8\u33ae-\u33b3\u33db\u33dc\ufb06\uff33\uff53]',
        'T': '[Tt\u0162-\u0165\u021a\u021b\u1d40\u1d57\u1e6a-\u1e71\u1e97\u2121\u2122\u24af\u24c9\u24e3\u3250\u32cf\u3394\u33cf\ufb05\ufb06\uff34\uff54]',
        'U': '[Uu\xd9-\xdc\xf9-\xfc\u0168-\u0173\u01af\u01b0\u01d3\u01d4\u0214-\u0217\u1d41\u1d58\u1d64\u1e72-\u1e77\u1ee4-\u1ee7\u2106\u24b0\u24ca\u24e4\u3373\u337a\uff35\uff55]',
        'V': '[Vv\u1d5b\u1d65\u1e7c-\u1e7f\u2163-\u2167\u2173-\u2177\u24b1\u24cb\u24e5\u2c7d\u32ce\u3375\u33b4-\u33b9\u33dc\u33de\uff36\uff56]',
        'W': '[Ww\u0174\u0175\u02b7\u1d42\u1e80-\u1e89\u1e98\u24b2\u24cc\u24e6\u33ba-\u33bf\u33dd\uff37\uff57]',
        'X': '[Xx\u02e3\u1e8a-\u1e8d\u2093\u213b\u2168-\u216b\u2178-\u217b\u24b3\u24cd\u24e7\u33d3\uff38\uff58]',
        'Y': '[Yy\xdd\xfd\xff\u0176-\u0178\u0232\u0233\u02b8\u1e8e\u1e8f\u1e99\u1ef2-\u1ef9\u24b4\u24ce\u24e8\u33c9\uff39\uff59]',
        'Z': '[Zz\u0179-\u017e\u01f1-\u01f3\u1dbb\u1e90-\u1e95\u2124\u2128\u24b5\u24cf\u24e9\u3390-\u3394\uff3a\uff5a]'
    };

    // Escape regex metacharacters, then cut the text into whitespace-separated terms.
    var terms = value.replace(/([|()[{.+*?^$\\])/g, '\\$1').split(/\s+/);

    // Longer terms go first so they are tried first in the alternation.
    terms.sort(function (left, right) {
        return right.length - left.length;
    });

    // Swap every letter that has an entry in the table for its character
    // class; any other character is kept untouched.
    var alternatives = terms.map(function (term) {
        return term.replace(/\S/g, function (symbol) {
            return accented[symbol.toUpperCase()] || symbol;
        });
    });

    var matcher = new RegExp(alternatives.join("|"), 'g');
    return value.match(matcher);
}

Exactly that. I need to call replace to swap "ão" and "ao" for their bold versions.

– LP. Gonçalves

2017/11/14 at 19:20
I put together this small piece of code that might give you some ideas: https://jsfiddle.net/h1gpea5v/

– Sam

2017/11/14 at 23:32

2 answers

Browser other questions tagged javascript jquery string regex substring

You are not signed in. Login or sign up in order to post.

by hkotsubo • **55,826** points · Answer 1 · 2021-07-16T15:05:45+00:00

The basic idea is to take what was typed and run the search disregarding accents and differences between upper and lower case. Then, for each occurrence found, you split the text into the part before the match, the match itself (wrapped in a strong tag), and the part after it.

Follow the code, and further down, the - long - explanation:

const itens = [ 'Arroz', 'Batata ao Cheff', 'Macarrão ao Molho', 'Salmão' ];

/**
 * Removes accents (Unicode combining marks) from `str` and records, for every
 * UTF-16 unit of the stripped text, where the character that produced it
 * starts in the original string.
 *
 * The map is needed because the stripped text and the original only have the
 * same length when the original uses precomposed accents; with decomposed
 * accents (e.g. "a" + U+0303) the indices drift apart.
 *
 * @param {string} str - original text.
 * @returns {{text: string, offsets: number[]}} `text` is the accent-free
 *     text; `offsets[k]` is the index in `str` of the character behind
 *     `text[k]`, and `offsets[text.length]` is `str.length`.
 */
function removerAcentosComIndices(str) {
    let text = '';
    const offsets = [];
    let pos = 0;
    for (const ch of str) { // iterates code point by code point
        const base = ch.normalize('NFD').replace(/\p{M}/ug, '');
        for (let k = 0; k < base.length; k++) {
            offsets.push(pos);
        }
        text += base;
        pos += ch.length;
    }
    offsets.push(str.length); // sentinel: end position for a match that reaches the end
    return { text, offsets };
}

/**
 * Input handler: lists every item that contains the typed text (ignoring
 * accents and case) and wraps each occurrence in a <strong> element.
 *
 * @param {Event} e - input event; the typed text is read from `e.target.value`.
 */
function buscarItens(e) {
    const res = document.querySelector('#resultado ul');
    res.innerHTML = ''; // clear the previous results

    const busca = e.target.value.normalize('NFD').replace(/\p{M}/ug, ''); // what was typed and must be searched for (accents removed)
    const regex = new RegExp(busca.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'ig');
    for (const item of itens) { // for each menu item
        const { text, offsets } = removerAcentosComIndices(item);
        let i = 0; // position in `item` up to which text has already been emitted
        const li = document.createElement('li'); // element that will hold the highlighted text
        for (const match of text.matchAll(regex)) { // check every occurrence of the search
            // Translate positions in the stripped text back to positions in the original
            // item. The end offset is the start of the next base character, so combining
            // marks that follow the last matched letter stay inside the highlight.
            const inicio = offsets[match.index];
            const fim = offsets[match.index + match[0].length];

            li.appendChild(document.createTextNode(item.slice(i, inicio))); // text before the occurrence

            const strong = document.createElement('strong');
            strong.appendChild(document.createTextNode(item.slice(inicio, fim))); // highlighted occurrence
            li.appendChild(strong);
            i = fim;
        }
        li.appendChild(document.createTextNode(item.slice(i))); // text after the last occurrence
        if (li.childElementCount !== 0) { // something was found: add it to the results
            res.appendChild(li);
        }
    }
}

// every time something is typed into the input, run the search
document.querySelector('#busca').addEventListener('input', buscarItens);

<input type="text" id="busca">

<div id="resultado">
    <ul></ul>
</div>

Remove accents

First I take what was typed and remove the accents:

const busca = e.target.value.normalize('NFD').replace(/\p{M}/ug, '');

How the normalize method works is explained in detail here, but to summarize: an accented letter, such as á (the letter a with an acute accent), can be represented in two ways according to Unicode:

composite - like code point U+00E1 (LATIN SMALL LETTER A WITH ACUTE) (á)
decomposed - as a combination of two code points (in this order):
- the letter "a" (without accent): U+0061 (LATIN SMALL LETTER A)
- the acute accent: U+0301 (COMBINING ACUTE ACCENT)

^{And to understand what a code point is, read here.}

The first form is called NFC, and the second, NFD. Therefore, the above code first normalizes the string to NFD, "breaking" the accents into two characters: the letter without accent and the accent itself.

Then the replace uses the Unicode Property Escape \p{M}, which matches all characters in the "Mark" categories (all the ones starting with "M" in this list). All accents fall into this category, and since I replace them with '' (the empty string), in practice this removes the accents. That is, if the string is ão, it becomes ao.

It is worth remembering that for the Unicode Property Escape to work, the regex needs the u flag (the letter u after the closing slash, see here for more details); it is currently well supported by most browsers. I also use the g flag to replace all occurrences (without this flag, only the first one is replaced).

Build regex for search

Next stretch:

const regex = new RegExp(busca.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'ig');

In the search string (which already had the accents removed in the previous step), I do replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'). This serves to sanitize the string, since it will be used later to construct a regex (regular expression). And as this regex will be built from something typed by the user, one should beware of the meta-characters (characters that have special meaning and need to be escaped with \). This replace is explained in more detail here (and this link also has other points that apply to your case, about building regex from user entries - I suggest you read).

Finally, I create the regex using the RegExp, which receives the string with the escaped meta-characters and the flags i (case insensitive) and g (searches for all occurrences).

The search

In the search, I take the item name and also remove the accents, in the same way done with the search string:

const text = item.normalize('NFD').replace(/\p{M}/ug, '');

Then I go through all the matches that the regex finds (for (const match of text.matchAll(regex))), and for each occurrence found I take the text that comes before it, the occurrence itself (putting it inside a strong tag) and the text that comes after it.

Remember that if more than one occurrence is found, they are all placed inside a tag strong. For example:

Notice that I typed "ao", and in the case of "Macarrão ao Molho" the result was "Macarr**ão** **ao** Molho" (both "ão" and "ao" were highlighted).

If you only want to highlight a single occurrence, the code would look a bit different. You would need to remove the g flag and swap matchAll for match (and drop the for loop, since match returns a single result):

const itens = [ 'Arroz', 'Batata ao Cheff', 'Macarrão ao Molho', 'Salmão' ];

/**
 * Removes accents (Unicode combining marks) from `str` and records, for every
 * UTF-16 unit of the stripped text, where the character that produced it
 * starts in the original string.
 *
 * The map is needed because the stripped text and the original only have the
 * same length when the original uses precomposed accents; with decomposed
 * accents (e.g. "a" + U+0303) the indices drift apart.
 *
 * @param {string} str - original text.
 * @returns {{text: string, offsets: number[]}} `text` is the accent-free
 *     text; `offsets[k]` is the index in `str` of the character behind
 *     `text[k]`, and `offsets[text.length]` is `str.length`.
 */
function removerAcentosComIndices(str) {
    let text = '';
    const offsets = [];
    let pos = 0;
    for (const ch of str) { // iterates code point by code point
        const base = ch.normalize('NFD').replace(/\p{M}/ug, '');
        for (let k = 0; k < base.length; k++) {
            offsets.push(pos);
        }
        text += base;
        pos += ch.length;
    }
    offsets.push(str.length); // sentinel: end position for a match that reaches the end
    return { text, offsets };
}

/**
 * Input handler: lists every item that contains the typed text (ignoring
 * accents and case) and wraps only the FIRST occurrence in a <strong> element.
 *
 * @param {Event} e - input event; the typed text is read from `e.target.value`.
 */
function buscarItens(e) {
    const res = document.querySelector('#resultado ul');
    res.innerHTML = ''; // clear the previous results

    const busca = e.target.value.normalize('NFD').replace(/\p{M}/ug, ''); // what was typed and must be searched for (accents removed)

    // *** Here: the g flag is gone ***
    const regex = new RegExp(busca.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'i');

    for (const item of itens) { // for each menu item
        const { text, offsets } = removerAcentosComIndices(item);
        const li = document.createElement('li'); // element that will hold the highlighted text
        // *** Here: match instead of matchAll, and no loop ***
        const match = text.match(regex);
        if (match) {
            // Translate positions in the stripped text back to positions in the original
            // item. The end offset is the start of the next base character, so combining
            // marks that follow the last matched letter stay inside the highlight.
            const inicio = offsets[match.index];
            const fim = offsets[match.index + match[0].length];

            li.appendChild(document.createTextNode(item.slice(0, inicio))); // text before the occurrence

            const strong = document.createElement('strong');
            strong.appendChild(document.createTextNode(item.slice(inicio, fim))); // highlighted occurrence
            li.appendChild(strong);

            li.appendChild(document.createTextNode(item.slice(fim))); // text after the occurrence
            res.appendChild(li);
        }
    }
}

// every time something is typed into the input, run the search
document.querySelector('#busca').addEventListener('input', buscarItens);

<input type="text" id="busca">

<div id="resultado">
    <ul></ul>
</div>

Another point is that no HTML details were given, so I assumed it would be a list (ul), so each search result was in a li. Anyway, it is not difficult to adapt to other cases, because the logic would be the same (text before, search text within strong, text after).

by Marcelo Junior • **816** points · Answer 2 · 2017-11-14T21:08:57+00:00

The Idea

To find every substring of a string that matches a given pattern and wrap each occurrence in a <strong> tag, where the matching is case-insensitive and ignores accents, we must first transform the contents of the string, then find the indices where the pattern matches, and finally insert <strong> and </strong> at indice and indice+size_padrao.

Since you have already managed to find the pattern and insert the <strong> tag, I'll explain how I converted the string to lowercase and removed its accents.

Converting to lowercase and removing the accents

Converting a string to lowercase is simple: we just have to use the function toLowerCase(). Now all that remains is to remove the accents.

In Portuguese we have accents on the letters a, e, i, o, u and c. So if we create a regular expression that finds each of these accented letters and replace it with the unaccented letter, we end up with an unaccented string.

// Regexes that catch accented lowercase letters; each key is the plain
// letter that replaces whatever its regex matches.
var ACENTO_REGEX = {
  a: /[\xE0-\xE6]/g,
  e: /[\xE8-\xEB]/g,
  i: /[\xEC-\xEF]/g,
  o: /[\xF2-\xF6]/g,
  u: /[\xF9-\xFC]/g,
  c: /\xE7/g
};

/**
 * Lowercases `str` and swaps the accented letters covered by ACENTO_REGEX
 * for their unaccented counterparts.
 *
 * @param {string} str - text to standardise.
 * @returns {string} lowercase text without those accents.
 */
function padronizaStr(str) {
  return Object.keys(ACENTO_REGEX).reduce(function (texto, letra) {
    return texto.replace(ACENTO_REGEX[letra], letra);
  }, str.toLowerCase());
}

Example

I created a list to demonstrate what it would be like, but you can change and put it the way that solves your problem using the same idea.

OBS.: I used a variable called padrao as if it already had the value of the field in it, just as an example.

// Regexes that catch accented lowercase letters; each key is the plain
// letter that padronizaStr substitutes for whatever its regex matches.
var ACENTO_REGEX = {
  'a': /[\xE0-\xE6]/g,
  'e': /[\xE8-\xEB]/g,
  'i': /[\xEC-\xEF]/g,
  'o': /[\xF2-\xF6]/g,
  'u': /[\xF9-\xFC]/g,
  'c': /\xE7/g
}

// Length of the string '<strong></strong>', i.e. how many characters each
// highlighted occurrence adds to the text.
var TAG_SIZE = '<strong></strong>'.length;

/**
 * Collects the text of the <li> children of a <ul>.
 *
 * @param {NodeList} ulNodes - child nodes of the list (anything with forEach).
 * @returns {string[]} the innerText of every node that has a non-empty one.
 */
function liValues(ulNodes) {
  const textos = [];

  ulNodes.forEach((no) => {
    // Nodes without text are skipped.
    if (!no.innerText) {
      return;
    }
    textos.push(no.innerText);
  });

  return textos;
}

/**
 * Lowercases `str` and swaps the accented letters covered by ACENTO_REGEX
 * for their unaccented counterparts.
 *
 * @param {string} str - text to standardise.
 * @returns {string} lowercase text without those accents.
 */
function padronizaStr(str) {
  return Object.keys(ACENTO_REGEX).reduce(function (texto, letra) {
    return texto.replace(ACENTO_REGEX[letra], letra);
  }, str.toLowerCase());
}

/**
 * Replaces the contents of a list with new items.
 *
 * @param {Object} ul - list wrapper exposing empty() and append(html)
 *     (a jQuery object in this snippet).
 * @param {Array} arrLi - contents of the new items; each one is inserted as
 *     markup between <li> and </li>.
 */
function alteraLista(ul, arrLi) {
  // Drop every existing <li> before adding the new ones.
  ul.empty();

  arrLi.forEach((conteudo) => {
    ul.append('<li>'.concat(conteudo, '</li>'));
  });
}

// Escapes the characters that would otherwise be parsed as markup when plain text is inserted as HTML.
function escapaHtml(texto) {
  return texto.replace(/[&<>]/g, function (c) {
    if (c === '&') return '&amp;';
    return c === '<' ? '&lt;' : '&gt;';
  });
}

/**
 * Reads the text of every <li> of the first <ul>, wraps each occurrence of
 * the pattern (compared in lowercase and without accents) in <strong> tags
 * and rewrites the list with the result.
 *
 * The pattern is hard-coded to "ao" as a stand-in for the value of the
 * search field.
 */
function aplicaAlteracao() {
  const ulNodes = $('ul')[0].childNodes;

  const padrao = padronizaStr("ao");
  // The pattern is meant to be literal text, so regex metacharacters are escaped.
  const fonte = padrao.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');

  const liArr = liValues(ulNodes).map(function (str) {
    // Search in a lowercase, accent-free copy; slice the original text so its
    // accents and casing are kept. padronizaStr swaps letters one-for-one, so
    // indices in the copy are used directly on the original.
    const normalizado = padronizaStr(str);
    const re = new RegExp(fonte, 'ig');

    let saida = '';
    let cursor = 0; // end of the last piece of `str` already copied to `saida`
    let match;
    while ((match = re.exec(normalizado)) !== null) {
      if (match[0].length === 0) {
        // An empty match never advances lastIndex on its own; step over it
        // so the loop cannot spin forever.
        re.lastIndex += 1;
        continue;
      }
      const fim = match.index + match[0].length;
      saida += escapaHtml(str.slice(cursor, match.index))
        + '<strong>' + escapaHtml(str.slice(match.index, fim)) + '</strong>';
      cursor = fim;
    }

    return saida + escapaHtml(str.slice(cursor));
  });

  alteraLista($('ul'), liArr);
}

<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>

<ul>
  <li>Arroz</li>
  <li>Batata ao Cheff</li>
  <li>Macarrão ao Molho</li>
  <li>Salmão</li>
</ul>

<button onclick="aplicaAlteracao()">
  Altera!
</button>