How to identify a number written out in words in a sentence

Asked

Viewed 617 times

4

I need a script to identify a number in a sentence. Ex:

"Two Weeks ago" -> number = 2

It could even be done by substitution. For example:

if str contain two, replace("two", 2)

Is there a module that can do this? I’ve looked, but only found some that do the opposite (number -> words).

2 answers

5


Thiago, what I found was a Python function ported to JavaScript, as you can see in that question. It’s quite functional:

// Lookup table: English words for 0-19 and for the tens 20-90, mapped to their value.
var Small = {
  zero: 0, one: 1, two: 2, three: 3, four: 4,
  five: 5, six: 6, seven: 7, eight: 8, nine: 9,
  ten: 10, eleven: 11, twelve: 12, thirteen: 13, fourteen: 14,
  fifteen: 15, sixteen: 16, seventeen: 17, eighteen: 18, nineteen: 19,
  twenty: 20, thirty: 30, forty: 40, fifty: 50,
  sixty: 60, seventy: 70, eighty: 80, ninety: 90
};

// Lookup table: English scale words mapped to the power of ten they stand for.
// Values from quintillion upward exceed Number.MAX_SAFE_INTEGER, so sums that
// involve them are approximate.
var Magnitude = {
  thousand: 1e3,
  million: 1e6,
  billion: 1e9,
  trillion: 1e12,
  quadrillion: 1e15,
  quintillion: 1e18,
  sextillion: 1e21,
  septillion: 1e24,
  octillion: 1e27,
  nonillion: 1e30,
  decillion: 1e33
};

/**
 * Converts a spelled-out English number into its numeric value.
 *
 * The input is lowercased and split on whitespace and hyphens. Words found in
 * `Small` are added to the current group, "hundred" multiplies the group by
 * 100, and a word found in `Magnitude` scales the group and moves it into the
 * running total. Every other word (including "and") is ignored, so
 * "Two Weeks ago" yields 2.
 *
 * @param {*} s - Value to convert; it is turned into a string with String().
 * @returns {number} The numeric value, or 0 when no number word is present.
 */
function text2num(s) {
  // Own-property checks keep inherited names such as "constructor" from
  // being mistaken for number words.
  var hasOwn = Object.prototype.hasOwnProperty;
  var total = 0; // sum of the groups already closed by a magnitude word
  var group = 0; // value collected since the last magnitude word

  String(s).toLowerCase().split(/[\s-]+/).forEach(function (word) {
    if (hasOwn.call(Small, word)) {
      group += Small[word];
    } else if (word === 'hundred') {
      group *= 100;
    } else if (hasOwn.call(Magnitude, word)) {
      total += group * Magnitude[word];
      group = 0;
    }
  });

  return total + group;
}

// Demo: convert a spelled-out number and append the numeric result to the page body.
document.body.innerHTML += text2num('five billion two million one hundred and fourteen thousand and sixty-nine');

The most "complicated" part, in this case, is the substitution, because capturing the sequence of words that may or may not correspond to a number is a little more laborious. But you can also use a marker; in this case I used $().

In the following example I mark "one" and "two", which clearly would not be necessary since they are small numbers; but in a case like "five Billion two Million one Hundred and fourteen Thousand and Sixty-Nine", it would be difficult to recognize the number among the other words without a marker.

// Word-to-value table for 0-19 and the tens 20-90.
var Small = {
  zero: 0, one: 1, two: 2, three: 3, four: 4,
  five: 5, six: 6, seven: 7, eight: 8, nine: 9,
  ten: 10, eleven: 11, twelve: 12, thirteen: 13, fourteen: 14,
  fifteen: 15, sixteen: 16, seventeen: 17, eighteen: 18, nineteen: 19,
  twenty: 20, thirty: 30, forty: 40, fifty: 50,
  sixty: 60, seventy: 70, eighty: 80, ninety: 90
};

// Scale-word table: each entry is the power of ten the word stands for.
var Magnitude = {
  thousand: 1e3,
  million: 1e6,
  billion: 1e9,
  trillion: 1e12,
  quadrillion: 1e15,
  quintillion: 1e18,
  sextillion: 1e21,
  septillion: 1e24,
  octillion: 1e27,
  nonillion: 1e30,
  decillion: 1e33
};

/**
 * Converts a spelled-out English number into its numeric value.
 *
 * The input is lowercased and split on whitespace and hyphens. Words found in
 * `Small` are added to the current group, "hundred" multiplies the group by
 * 100, and a word found in `Magnitude` scales the group and moves it into the
 * running total. Every other word (including "and") is ignored.
 *
 * @param {*} s - Value to convert; it is turned into a string with String().
 * @returns {number} The numeric value, or 0 when no number word is present.
 */
function text2num(s) {
  // Own-property checks keep inherited names such as "constructor" from
  // being mistaken for number words.
  var hasOwn = Object.prototype.hasOwnProperty;
  var total = 0; // sum of the groups already closed by a magnitude word
  var group = 0; // value collected since the last magnitude word

  String(s).toLowerCase().split(/[\s-]+/).forEach(function (word) {
    if (hasOwn.call(Small, word)) {
      group += Small[word];
    } else if (word === 'hundred') {
      group *= 100;
    } else if (hasOwn.call(Magnitude, word)) {
      total += group * Magnitude[word];
      group = 0;
    }
  });

  return total + group;
}

// Matches one $(...) marker. The capture group holds the spelled-out number;
// excluding ")" stops a match from running past its own closing parenthesis.
var regex = /\$\(([^$)]+)\)/g;
var text = document.querySelectorAll('p')[0].innerHTML;

// A replacer callback converts every marker in a single pass. The text is left
// untouched when it contains no marker, and no RegExp has to be built from the
// (unescaped) marker content.
text = text.replace(regex, function (marker, words) {
  return text2num(words);
});

document.body.innerHTML += "<br><br>";
document.body.innerHTML += text;
<!-- Demo paragraph: each $(...) marker wraps a spelled-out number for the script to convert. -->
<p>No $(two) objects can occupy the same place at $(one) time
</p>

Version without the Marker

I think I’ve found a way to recognize a number among other words without needing markers beforehand; please look for possible mistakes. I created the function "replaceToNum()", which takes any string as an argument and returns the same string with the written-out numbers converted into digits:

/**
 * Replaces every spelled-out English number in a string with its digits.
 *
 * A number phrase is a run of number words ("two", "hundred", "million", ...)
 * joined by spaces, hyphens or the word "and". Phrases must start and end on a
 * word boundary, so a lone "and" or a number word inside another word (such as
 * "someone") is left alone. Matching is case-insensitive.
 *
 * Known limitation: "and" between any two number words joins them into one
 * phrase, so "two and three" becomes 5.
 *
 * @param {*} text - Value to scan; it is turned into a string with String().
 * @returns {string} The text with each number phrase replaced by its value;
 *   text without number words is returned unchanged.
 */
function replaceToNum(text) {
  var Small = {
    zero: 0, one: 1, two: 2, three: 3, four: 4, five: 5, six: 6, seven: 7, eight: 8, nine: 9,
    ten: 10, eleven: 11, twelve: 12, thirteen: 13, fourteen: 14, fifteen: 15, sixteen: 16,
    seventeen: 17, eighteen: 18, nineteen: 19,
    twenty: 20, thirty: 30, forty: 40, fifty: 50, sixty: 60, seventy: 70, eighty: 80, ninety: 90
  };
  var Magnitude = {
    thousand: 1e3, million: 1e6, billion: 1e9, trillion: 1e12, quadrillion: 1e15,
    quintillion: 1e18, sextillion: 1e21, septillion: 1e24, octillion: 1e27,
    nonillion: 1e30, decillion: 1e33
  };
  var hasOwn = Object.prototype.hasOwnProperty;

  // Converts one matched phrase to a number; words without a value ("and") are skipped.
  function text2num(phrase) {
    var total = 0; // sum of the groups already closed by a magnitude word
    var group = 0; // value collected since the last magnitude word
    phrase.toLowerCase().split(/[\s-]+/).forEach(function (word) {
      if (hasOwn.call(Small, word)) {
        group += Small[word];
      } else if (word === 'hundred') {
        group *= 100;
      } else if (hasOwn.call(Magnitude, word)) {
        total += group * Magnitude[word];
        group = 0;
      }
    });
    return total + group;
  }

  var numberWord = '(?:' + Object.keys(Magnitude).concat('hundred', Object.keys(Small)).join('|') + ')';
  // " and " is tried first so that it is only consumed when another number word follows.
  var separator = '(?: +and +|[ -]+)';
  var regex = new RegExp('\\b' + numberWord + '(?:' + separator + numberWord + ')*\\b', 'gi');

  // Replacing through a callback substitutes each phrase exactly where it was matched.
  return String(text).replace(regex, function (phrase) {
    return String(text2num(phrase));
  });
}
// Sample input mixing ordinary words with spelled-out numbers in varying letter case.
var text = "No two objects can occupy the same place one time. Bla bla bla five billion Two Million one hundred and fourteen thousand and sixty-nine. Four. Minha criatividade tá ZeRo";

// Demo output: append two line breaks, then the converted sentence, to the page body.
document.body.innerHTML += "<br><br>";
document.body.innerHTML += replaceToNum(text);

  • Samir, I tested your solution (https://jsfiddle.net/hdtqc12u/1/) and noticed that it does not accept uppercase letters in the numbers, nor more than one occurrence per string. I don’t know if that matters to the OP. Otherwise, +1.

  • @Sergio, I think the problem is solved. Thank you for the addendum.

  • Thank you, everyone!!

3

A while ago I worked on this idea in a project where I needed to filter out ages of clients in a <input>. Then I ended up changing the approach and creating buttons to receive numbers without having to filter them. But the idea was this:

// Number words per language code. Every list follows the same order as
// `mapper` below, so a word's index gives its value through mapper[index].
var numbers = {
    en: [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
        'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred'
    ],
    se: [
        'noll', 'ett', 'två', 'tre', 'fyra', 'fem', 'sex', 'sju', 'åtta', 'nio',
        'tio', 'elva', 'tolv', 'tretton', 'fjorton', 'femton', 'sexton', 'sjutton', 'arton', 'nitton',
        'tjugo', 'trettio', 'fyrtio', 'femtio', 'sextio', 'sjuttio', 'åttio', 'nittio', 'hundra'
    ],
    de: [
        'null', 'eins', 'zwei', 'drei', 'vier', 'fünf', 'sechs', 'sieben', 'acht', 'neun',
        'zehn', 'elf', 'zwölf', 'dreizehn', 'vierzehn', 'fünfzehn', 'sechzehn', 'siebzehn', 'achtzehn', 'neunzehn',
        'zwanzig', 'dreißig', 'vierzig', 'fünfzig', 'sechzig', 'siebzig', 'achtzig', 'neunzig', 'hundert'
    ],
    no: [
        'null', 'en', 'to', 'tre', 'fire', 'fem', 'seks', 'sju', 'åtte', 'ni',
        'ti', 'elleve', 'tolv', 'tretten', 'fjorten', 'femten', 'seksten', 'sytten', 'atten', 'nitten',
        'tyve', 'tretti', 'førti', 'femti', 'seksti', 'sytti', 'åtti', 'nitti', 'hundre'
    ]
};
// Numeric value for each position of the word lists above.
var mapper = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 30, 40, 50, 60, 70, 80, 90, 100
];

/**
 * Extracts the numbers mentioned in a string, whether spelled out in one of
 * the languages listed in `numbers` or written as digits.
 *
 * Spelled-out numbers come first, ordered by language and then by position in
 * that language's word list (not by position in the text); digit runs follow.
 * A word is reported at most once per language. A tens word (or "hundred")
 * directly followed by a units word is merged into one value, e.g.
 * "thirty seven" gives 37.
 *
 * Relies on entries 1-9 of every word list being the units one to nine, in
 * step with `mapper`.
 *
 * @param {string} string - Text to scan.
 * @returns {number[]} The numbers found; empty when there are none.
 */
function findNumber(string) {
    // True when `pattern` occurs without a word character or hyphen glued to
    // either side. Only the fixed number words are ever passed in, so no
    // untrusted text reaches the RegExp constructor.
    function standsAlone(pattern) {
        return new RegExp('(?:^|[^\\w\\-])' + pattern + '(?:[^\\w\\-]|$)', 'i').test(string);
    }

    var results = [];
    Object.keys(numbers).forEach(function(lang) {
        var nrs = numbers[lang];
        nrs.forEach(function(nr, i) {
            if (!standsAlone(nr)) return;
            var mappedNumber = mapper[i];
            if (mappedNumber > 19) {
                // Merge with a units word only when it really follows this word;
                // any other following word leaves the value as it is.
                for (var u = 1; u <= 9; u++) {
                    if (standsAlone(nr + '\\s+' + nrs[u])) {
                        // The unit was normally collected earlier on its own; drop
                        // that entry so it is not reported twice.
                        var alreadyFound = results.lastIndexOf(mapper[u]);
                        if (alreadyFound !== -1) results.splice(alreadyFound, 1);
                        mappedNumber += mapper[u];
                        break;
                    }
                }
            }
            results.push(mappedNumber);
        });
    });
    var more = string.match(/\d+/g) || [];
    return results.concat(more).map(Number);
}

Basically this function compares text from different languages and extracts numbers. An example would be:

// Sample sentences ("testes" = "tests"); each one is passed to findNumber and the
// resulting arrays are logged as JSON. The trailing comment shows the output.
var testes = ['TWO WEEKS AGO', 'two weeks ago', 'I am thirty seven years old!', 'I was the the number five in my class! Now I am number one!'];
console.log(JSON.stringify(testes.map(findNumber))); // [[2],[2],[37],[1,5]]

I didn’t take it any further at the time. It might serve you as it is; these are the things I remember wanting to do:

  • respect the order found in the result
  • allow orders greater than 100 (the logic could be with Math.floor(nr / 100) and search for the N following words)
  • filter the language first to avoid cases where words from other languages match.

If you want to try it, you can see a live example here: https://jsfiddle.net/hdtqc12u/

  • Great option. Adding the language filter would be very interesting, haha.

Browse other questions tagged

You are not signed in. Login or sign up in order to post.