Convert string with HTML tags into an array

Asked

Viewed 1,422 times

5

Consider the following string:

var texto = 'Esse é um texto de <span class="red"> teste </span>';

I need to transform the string into an array, splitting it on spaces, i.e.:

var palavras = texto.split(" ");

The problem is that the text contains HTML and in that case the resulting array will be:

palavras[0] = 'Esse';
palavras[1] = 'é';
palavras[2] = 'um';
palavras[3] = 'texto';
palavras[4] = 'de';
palavras[5] = '<span';
palavras[6] = 'class="red">';
palavras[7] = 'teste';
palavras[8] = '</span>';

However I need the resulting array to be the following:

palavras[0] = 'Esse';
palavras[1] = 'é';
palavras[2] = 'um';
palavras[3] = 'texto';
palavras[4] = 'de';
palavras[5] = '<span class="red"> teste </span>';

How to do this using javascript?

  • Using a regular expression, you have to split on spaces except within tags

  • @Guilhermecostamilam the problem is creating this regular expression...

3 answers

4

You can use DOMParser to parse the HTML text. From there, just manipulate the resulting DOM to get the elements you need:

// Parse the HTML fragment with the browser's own parser instead of a regex.
var texto = 'Esse é um texto de <span class="red"> teste </span>';
// parseFromString builds a complete document (html, head, body) around the fragment.
var corpo = new DOMParser().parseFromString(texto, 'text/html').body;

// Pull the <span> out of the body so that only the plain text stays behind.
var trecho = corpo.removeChild(corpo.querySelector('span'));

// Split what is left on spaces, then append the span's markup as one final entry.
var palavras = corpo.innerHTML.trim().split(' ').concat(trecho.outerHTML);

console.log(palavras);

The code is very specific to the text you have placed. If you have other tags in other positions, obviously appropriate adjustments should be made.


It is also possible to use jQuery's parseHTML function. The idea is the same: parse the string and extract the elements you need.

// Same approach with jQuery: $.parseHTML turns the string into a list of DOM nodes.
var texto = 'Esse é um texto de <span class="red"> teste </span>';
var nos = $.parseHTML(texto);

// Only assigned by the '#text' branch below, so the text has to come before the span.
var palavras;
$.each(nos, function (_indice, no) {
    switch (no.nodeName) {
        case '#text':
            // Plain text: split it on spaces to obtain the individual words.
            palavras = no.nodeValue.trim().split(' ');
            break;
        case 'SPAN':
            // The span is kept whole, markup included, as a single entry.
            palavras.push(no.outerHTML);
            break;
    }
});

console.log(palavras);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>

Again, the code is very specific to your case as it expects a text followed by a span. Adapt to other cases if necessary.


Regex, although very cool, is not always the best solution, especially for parsing HTML. If there are already specific parsers for the type of data you are manipulating, it is preferable to use them.

4

Following the same idea presented by hkotsubo, you can insert the contents of your string into a virtual element and iterate over its child nodes. If a node is a text node, its content is split into words; otherwise, the element's own HTML is returned as is.

/**
 * Lazily splits an HTML string into words while keeping every top-level
 * element intact as a single entry.
 *
 * The markup is parsed by assigning it to the innerHTML of a detached <div>.
 * NOTE(review): innerHTML parses whatever it is given; if `text` can come from
 * an untrusted source, confirm it is sanitized before calling this function.
 *
 * @param {string} text - HTML fragment to split.
 * @yields {string} In document order: each word of the top-level text nodes,
 *   and the outerHTML of each top-level element. Other node kinds (for
 *   example comments) are skipped.
 */
function* get_elements(text) {
  // Numeric values of Node.TEXT_NODE and Node.ELEMENT_NODE. Checking nodeType
  // instead of `instanceof HTMLElement` keeps non-HTML elements such as an
  // inline <svg>, which would otherwise be dropped silently.
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;

  const container = document.createElement('div');
  container.innerHTML = text;

  for (const node of container.childNodes) {
    if (node.nodeType === TEXT_NODE) {
      // Split on runs of HTML whitespace (space, tab, LF, FF, CR) so that line
      // breaks and tabs separate words too; filter drops the empty strings
      // produced by leading or trailing whitespace.
      yield* node.data.split(/[ \t\n\f\r]+/).filter(Boolean);
    } else if (node.nodeType === ELEMENT_NODE) {
      yield node.outerHTML;
    }
  }
}

// Example usage: the generator is lazy, so its values are copied into an array for logging.
const elements = get_elements('Esse é um texto de <span class="red"> teste </span>');

console.log(Array.from(elements));


An analogous solution for ES5 would be:

/**
 * ES5 version: splits an HTML string into words while keeping every top-level
 * element intact as a single entry, and returns the pieces as an array.
 *
 * The markup is parsed by assigning it to the innerHTML of a detached <div>.
 * NOTE(review): innerHTML parses whatever it is given; if `text` can come from
 * an untrusted source, confirm it is sanitized before calling this function.
 *
 * @param {string} text - HTML fragment to split.
 * @returns {string[]} In document order: each word of the top-level text
 *   nodes, and the outerHTML of each top-level element. Other node kinds (for
 *   example comments) are skipped.
 */
function get_elements(text) {
  // Numeric values of Node.TEXT_NODE and Node.ELEMENT_NODE. Checking nodeType
  // instead of `instanceof HTMLElement` keeps non-HTML elements such as an
  // inline <svg>, which would otherwise be dropped silently.
  var TEXT_NODE = 3;
  var ELEMENT_NODE = 1;
  var result = [];  // array that is returned instead of a generator
  var container = document.createElement('div');  // var instead of const
  var nodes;
  var node;
  var i;

  container.innerHTML = text;
  nodes = container.childNodes;

  for (i = 0; i < nodes.length; i++) {  // "classic" for instead of for..of
    node = nodes[i];
    if (node.nodeType === TEXT_NODE) {
      // Split on runs of HTML whitespace (space, tab, LF, FF, CR) so that line
      // breaks and tabs separate words too, then append the pieces.
      result = result.concat(
        node.data.split(/[ \t\n\f\r]+/)
          .filter(
            function (it) {  // anonymous function instead of an arrow function
              return it;  // drops the empty strings left by edge whitespace
            }
          )
      );
    } else if (node.nodeType === ELEMENT_NODE) {
      result.push(node.outerHTML);  // the element's markup goes in as one entry
    }
  }

  return result;
}

// Example usage: this version returns a ready-made array, so it is logged as is.
var elements = get_elements('Esse é um texto de <span class="red"> teste </span>');

console.log(elements);

The idea is exactly the same, but it returns an array instead of a generator, the for..of loop was replaced by a "classic" for, and the arrow function in filter was replaced by an anonymous function.

  • 1

    Could you give a solution in ES5? Nothing against ES6, but even though people cheer for and praise ES6, IE11 and old Android devices are still alive.

  • 2

    @Guillhermenascimento done.

  • 1

    Show! And still with explanation about the differences +1

0

I managed to solve the problem. I don’t know if it is the best approach but it is a solution that works. I await suggestions.

Before separating by space, I stored all tags and their contents in an array and replaced each position with the respective index:

var texto = 'Esse é um texto de <span class="red"> teste </span>';

// Matches one complete element, from its opening tag to the closing tag of
// the SAME name:
//   <([A-Za-z][^\s\/>]*)(?=[\s\/>])  tag name, captured as group 1
//   [^>]*>                          rest of the opening tag (attributes)
//   [\s\S]*?                        content, lazily, line breaks included
//   <\/\1\s*>                       closing tag; \1 must repeat the opening name
// Tying the closing tag to the opening one keeps a void tag such as <br> from
// being paired with an unrelated closing tag, and keeps an outer element
// together with the different elements nested inside it.
// Limitation: an element nested inside another one of the same name still
// ends at the first closing tag.
var tagPattern = /<([A-Za-z][^\s\/>]*)(?=[\s\/>])[^>]*>[\s\S]*?<\/\1\s*>/gi;

// Collect every HTML element found in the text (null when there is none).
// output: ['<span class="red"> teste </span>'];
var tags = texto.match(tagPattern);

// If any element was found, swap each one for a "{index}" placeholder so that
// the spaces inside the markup are not seen by the later split.
if (tags) {
    tags.forEach(function (tag, index) {
        texto = texto.replace(tag, "{" + index + "}");
    });
}

The result of the string before separating by space will be:

texto = 'Esse é um texto de {0}';
var palavras = texto.split(" ");

The result after splitting by space will be the following array:

palavras[0] = 'Esse';
palavras[1] = 'é';
palavras[2] = 'um';
palavras[3] = 'texto';
palavras[4] = 'de';
palavras[5] = '{0}';

Now just replace element 5, which holds an index, with its tag:

// Put the stored markup back: every "{n}" placeholder found in a word is
// swapped for tags[n]. Replacing inside the word (instead of overwriting the
// whole word) keeps any punctuation glued to the placeholder, e.g. "{0},".
palavras.forEach(function (palavra, index) {
    palavras[index] = palavra.replace(/\{([0-9]+)\}/g, function (placeholder, digits) {
        // tags is null when the text had no markup at all
        var tag = tags ? tags[parseInt(digits, 10)] : undefined;
        // Text that merely looks like a placeholder but has no stored tag
        // is left exactly as it was.
        return tag === undefined ? placeholder : tag;
    });
});

The result then will be:

palavras[0] = 'Esse';
palavras[1] = 'é';
palavras[2] = 'um';
palavras[3] = 'texto';
palavras[4] = 'de';
palavras[5] = '<span class="red"> teste </span>';
  • Why use * followed by ? — the first already means "none or many"; wouldn't that be redundant?

  • 1

    @Guilhermecostamilam In fact, the ?, when used right after * or +, has another meaning. See examples where it makes a difference to use * and *? here and here. In the case above, it is useful when you have more than one tag, for example.

  • 1

    @hkotsubo I did not know about this lazy quantifier; it would have helped me a few weeks ago, thanks

Browse other questions tagged

You are not signed in. Login or sign up in order to post.