sentence_parser.js

Jump To … +

abbreviations.js adjectives.js convertables.js dates.js demonyms.js firstnames.js honourifics.js irregular_nouns.js irregular_verbs.js misc.js multiples.js numbers.js organisations.js phrasal_verbs.js places.js uncountables.js verbs.js fns.js index.js lexicon.js negate.js passive_voice.js contractions.js fancy_lumping.js grammar_rules.js parts_of_speech.js phrasal_verbs.js tagger.js word_rules.js question.js sentence.js statement.js tense.js adjective.js to_adverb.js to_comparative.js to_noun.js to_superlative.js adverb.js to_adjective.js is_acronym.js article.js date.js date_rules.js is_date.js parse_date.js is_plural.js is_uncountable.js noun.js is_organisation.js organisation.js gender.js is_person.js parse_name.js person.js is_place.js place.js pluralize.js pronoun.js singularize.js is_value.js numbers.js to_number.js units.js value.js term.js conjugate.js from_infinitive.js predict_form.js suffix_rules.js to_actor.js to_infinitive.js negate.js verb.js sentence_parser.js text.js

sentence_parser.js
¶

(Rule-based sentence boundary segmentation) - chop the given text into its proper sentences. Ignore periods, question marks and exclamation marks used in acronyms, abbreviations, numbers, etc. @spencermountain 2015 MIT
```
'use strict';
let abbreviations = require('../data/abbreviations').abbreviations;

/**
 * Rule-based sentence boundary segmentation.
 *
 * The text is first split greedily after every `.`, `?` or `!` that is
 * followed by whitespace, the end of the input or a double quote. Chunks
 * that end in a known abbreviation, a single-letter acronym part or an
 * ellipsis are then joined back onto the chunk that follows them.
 *
 * @param {string} text - the text to segment.
 * @returns {string[]} the whitespace-trimmed sentences, or `[text]` when no
 *   sentence could be extracted (e.g. for an empty string).
 */
const sentence_parser = function(text) {
  const sentences = [];

  // first do a greedy-split..
  const chunks = text.split(/(\S.+?[.\?!])(?=\s+|$|")/g);

  // detection of non-sentence chunks
  const abbrev_reg = new RegExp('\\b(' + abbreviations.join('|') + ')[.!?] ?$', 'i');
  // A regex literal keeps the dots escaped. The previous string-built pattern
  // lost its backslashes, so its trailing `.?` matched any character and
  // chunks such as "Where am I?" were mistaken for acronyms.
  const acronym_reg = /[ .][A-Z]\.?$/i;
  const ellipsis_reg = /\.\.\.*$/;

  // true when the chunk's final punctuation does not really end a sentence
  const is_unfinished = (chunk) =>
    abbrev_reg.test(chunk) || acronym_reg.test(chunk) || ellipsis_reg.test(chunk);

  // loop through these chunks, and join the non-sentence chunks back together..
  for (let i = 0; i < chunks.length; i++) {
    if (!chunks[i]) {
      continue;
    }

    // trim whitespace
    const chunk = chunks[i].trim();
    const next = chunks[i + 1];

    // should this chunk be combined with the next one?
    if (next && is_unfinished(chunk)) {
      // carry the text forward; the merged chunk is re-examined next iteration
      chunks[i + 1] = `${chunk} ${next}`.replace(/ +/g, ' ');
    } else if (chunk.length > 0) {
      // this chunk is a proper sentence..
      sentences.push(chunk);
    }
  }

  // if we never got a sentence, return the given text
  if (sentences.length === 0) {
    return [text];
  }

  return sentences;
};

module.exports = sentence_parser;

¶

console.log(sentence_parser('For example. This doesn\'t work for the US'));