How to delete text in parentheses with regexp?

Asked

Viewed 2,070 times

3

I am developing an HTML5 application that uses the Wikipedia API to give the definition of whatever we ask for. I used code that was shared on another-language Stack site and noticed that it uses regular expressions to remove certain parts of the extracted text (links, references, tags, etc.).

However, I would also like to remove texts between parentheses as in the example below:

Text that currently comes out like this:

Application software (application or application) is a computer program that aims to help its user perform...

would then look like this:

Application software is a computer program that aims to help its user perform...

I have the code below and I would like some help with the regular expression part (I haven’t learned regular expressions yet, but I’m trying) to solve my problem:

// When the query box (#q) is not empty, its text becomes the term to define.
if ("" !== d.getElementById('q').value) {
  algo = d.getElementById('q').value;
}
/**
 * Starts a definition lookup for `algo` and updates the UI fields around it.
 *
 * @param {string} algo - Word or sentence to be defined.
 */
var definir = function (algo) {
  // Kick off the request first: up to 1000 characters, at least 1.
  startFetch(algo, 1, 1000);

  // Echo what is being defined and mirror the term into the query box.
  var pedido = "Definir > " + algo;
  d.getElementById("feedback").value = pedido;
  d.getElementById('q').value = algo;
  search();

  // Placeholder answer shown until the fetch callback replaces it.
  d.getElementById("resposta").value = "Só um momento...";
  voz();
};

// Text box element with id "resposta" (the answer field).
var textbox = d.getElementById("resposta");

// State shared between startFetch and onFetchComplete.
var tempscript = null; // <script> element of the request in flight, or null when idle
var minchars;          // minimum extract length, set by startFetch
var maxchars;          // maximum number of characters requested from the API
var attempts;          // counter reset by startFetch on a non-retry call

/**
 * Requests the intro extract of a pt.wikipedia.org article through JSONP by
 * appending a temporary <script> element to the page. The API answers by
 * calling the global `onFetchComplete`.
 *
 * @param {string} algo - Word or sentence to be defined (article title).
 * @param {number} minimumCharacters - Stored in `minchars` for the callback.
 * @param {number} maximumCharacters - Stored in `maxchars`; sent as `exchars`.
 * @param {boolean} [isRetry] - When truthy, the stored limits and the attempt
 *   counter are left untouched.
 */
function startFetch(algo, minimumCharacters, maximumCharacters, isRetry) {
  // Only one request may be in flight at a time.
  if (tempscript) return;

  // A fresh (non-retry) request resets the bookkeeping used by the callback.
  if (!isRetry) {
    attempts = 0;
    minchars = minimumCharacters;
    maxchars = maximumCharacters;
  }

  tempscript = d.createElement("script");
  tempscript.type = "text/javascript";
  tempscript.id = "tempscript";
  tempscript.src = "https://pt.wikipedia.org/w/api.php?action=query&titles="
    // Escaped so that "&", "#", "+", "=" or spaces in the term stay part of
    // the title instead of altering the query string.
    + encodeURIComponent(algo)
    + "&redirects="
    + "&prop=extracts"
    + "&exchars="
    + maxchars // maximum number of characters to fetch
    + "&exintro"
    + "&format=json"
    + "&callback=onFetchComplete"
    + "&requestid="
    // Random request id appended to every URL.
    + Math.floor(Math.random() * 999999).toString();
  d.body.appendChild(tempscript);
}

/**
 * JSONP callback for the Wikipedia request: removes the temporary <script>
 * element, cleans the returned extract and either shows it or reports that no
 * definition was found.
 *
 * @param {Object} data - API response; the extract is read from the first
 *   entry of `data.query.pages`.
 * @param {string} [algo] - Term being defined. When it is not supplied, the
 *   current text of the query box (#q) is used instead.
 */
function onFetchComplete(data, algo) {
  d.body.removeChild(tempscript);
  tempscript = null;

  // Without this fallback a call that passes only `data` would log
  // "Definir > undefined".
  var termo = algo !== undefined ? algo : d.getElementById('q').value;

  // A missing page or an error response carries no extract; treat that as
  // empty text so the "not found" branch runs instead of throwing.
  var pages = data && data.query && data.query.pages;
  var page = getFirstProp(pages);
  var extract = page && page.extract;
  var s = typeof extract === "string" ? (htmlDecode(stripTags(extract)) || "") : "";

  if (s.length > minchars || attempts++ > 5) {
    d.getElementById("resposta").value = s;
    d.getElementById("feedback").value = "Definindo...";
    voz(); // reads the definition aloud
    espera(); // delay to roughly match the audio loading time
    d.getElementById("log").value += "Definir > "
      + termo // word or sentence being defined
      + "\n"
      + s // loaded definition
      + "\n\n";
    saveHist(); // saves the definition to the conversation log
  } else {
    d.getElementById('resposta').value = "Não encontrei a definição, " + nomeDoUsuario + ".";
    voz();
  }
}
/**
 * Returns the value of the first property visited by a for...in loop over
 * `obj`, or undefined when the loop visits nothing.
 */
function getFirstProp(obj) {
  var key;
  for (key in obj) {
    return obj[key];
  }
  return undefined;
}
/**
 * Removes opening tags (with or without attributes) and closing tags from
 * `s`, leaving the text between them untouched.
 */
function stripTags(s) {
  // The author's problem starts below *****************************************
  var tagPattern = /<\w+(\s+("[^"]*"|'[^']*'|[^>])+)?>|<\/\w+>/gi;
  var semTags = s.replace(tagPattern, "");
  // The author's problem ends above *******************************************

  return semTags;
}
/**
 * Decodes HTML entities in `input` by letting the browser parse it inside a
 * detached <div> and reading the resulting text back.
 *
 * The whole text content is returned, not just the first child node, so text
 * that follows any leftover markup is no longer dropped and the result is a
 * string even when the first node is an element.
 *
 * NOTE(review): `input` is parsed as HTML; callers should pass text whose
 * tags have already been stripped, as onFetchComplete does.
 *
 * @param {string} input - Text that may contain HTML entities.
 * @returns {string} Decoded text; "" for empty input.
 */
function htmlDecode(input) {
  var e = document.createElement("div");
  e.innerHTML = input;
  return e.textContent;
}

1 answer

5


I see that your regex already handles other cases you want to treat. I suggest you add (\s\(.*?\))| at the beginning of your regex.

I usually use this site to work with regex, it is very useful.

Anyway what this suggestion does is:

The outer parentheses create a capture group (...). Right after opening this capture group I added a whitespace character, which is written \s, because in your text there is always a blank space before the opening parenthesis. In a regex, the backslash \ escapes the next character. As you saw above, parentheses have a special function in a regex (capture group); writing \( turns that function off and matches the literal character ( itself.

Then I use .*? which means any character except a new line, zero or more times, as few times as possible, so the match stops at the first closing parenthesis, matched by \). I end with | which is the "or" of the regex: caso A | caso B | caso C means case A or case B or case C.

// Demo: remove " (…)" groups and HTML tags from a sample sentence and show
// the result on the page.
var s = 'Software aplicativo (aplicativo ou aplicação) é um programa de computador que tem por objetivo ajudar o seu usuário a desempenhar..';
document.body.innerHTML = s.replace(
  /(\s\(.*?\))|<\w+(\s+("[^"]*"|'[^']*'|[^>])+)?>|<\/\w+>/gi,
  ''
);

  • This is exactly what I needed. A very well explained answer, I learned here what I have been trying to learn for a long time. Thank you very much, @Sergio.

  • 1

    @Great winner! I’m glad I helped. I voted +1 in your question also for giving an example of what you were looking for.

Browse other questions tagged

You are not signed in. Login or sign up in order to post.