• Jump To … +
    abbreviations.js adjectives.js convertables.js dates.js demonyms.js firstnames.js honourifics.js irregular_nouns.js irregular_verbs.js misc.js multiples.js numbers.js organisations.js phrasal_verbs.js places.js uncountables.js verbs.js fns.js index.js lexicon.js negate.js passive_voice.js contractions.js fancy_lumping.js grammar_rules.js parts_of_speech.js phrasal_verbs.js tagger.js word_rules.js question.js sentence.js statement.js tense.js adjective.js to_adverb.js to_comparative.js to_noun.js to_superlative.js adverb.js to_adjective.js is_acronym.js article.js date.js date_rules.js is_date.js parse_date.js is_plural.js is_uncountable.js noun.js is_organisation.js organisation.js gender.js is_person.js parse_name.js person.js is_place.js place.js pluralize.js pronoun.js singularize.js is_value.js numbers.js to_number.js units.js value.js term.js conjugate.js from_infinitive.js predict_form.js suffix_rules.js to_actor.js to_infinitive.js negate.js verb.js sentence_parser.js text.js
  • sentence_parser.js

  • ¶

    (Rule-based sentence boundary segmentation) - chop given text into its proper sentences. Ignore periods/questions/exclamations used in acronyms/abbreviations/numbers, etc. @spencermountain 2015 MIT

    'use strict';
    let abbreviations = require('../data/abbreviations').abbreviations;
    
    /**
     * Rule-based sentence boundary segmentation.
     *
     * Greedily splits the text on every '.', '?' or '!' that is followed by
     * whitespace, a double-quote or the end of the text, then re-joins the
     * pieces whose terminator did not really end a sentence: a known
     * abbreviation ("Dr."), a single-letter initial ("John F.") or an
     * ellipsis ("..").
     *
     * @param {string} text - the raw text to segment
     * @returns {string[]} the trimmed sentences, in order; `[text]` when no
     *   sentence could be extracted (e.g. empty or whitespace-only input)
     */
    const sentence_parser = function(text) {
      const sentences = [];

      // first do a greedy-split..
      // (the capture group keeps each matched sentence candidate in the result,
      // alongside the whitespace that sat between candidates)
      const chunks = text.split(/(\S.+?[.\?!])(?=\s+|$|")/g);

      // detection of non-sentence chunks
      const abbrev_reg = new RegExp('\\b(' + abbreviations.join('|') + ')[.!?] ?$', 'i');
      // A lone letter preceded by a space, pipe or period, optionally followed by
      // a period. The backslashes are doubled because this is a string literal:
      // a single '\.' collapses to '.', which made the pattern accept ANY
      // trailing character and wrongly merged chunks such as "Is it I?".
      const acronym_reg = new RegExp('[ |\\.][A-Z]\\.?$', 'i');
      const elipses_reg = new RegExp('\\.\\.\\.*$');

      // true when the chunk's final punctuation is not a real sentence boundary
      const ends_mid_sentence = (chunk) =>
        abbrev_reg.test(chunk) || acronym_reg.test(chunk) || elipses_reg.test(chunk);

      // loop through these chunks, and join the non-sentence chunks back together..
      for (let i = 0; i < chunks.length; i++) {
        if (!chunks[i]) {
          continue;
        }
        // trim whitespace
        const chunk = chunks[i].trim();
        // should this chunk be combined with the next one?
        if (chunks[i + 1] && ends_mid_sentence(chunk)) {
          // carry the text forward; it is re-examined on the next iteration
          chunks[i + 1] = (chunk + ' ' + chunks[i + 1]).replace(/ +/g, ' ');
        } else if (chunk.length > 0) {
          // this chunk is a proper sentence..
          sentences.push(chunk);
        }
      }

      // if we never got a sentence, return the given text
      if (sentences.length === 0) {
        return [text];
      }

      return sentences;
    };
    
    // Export the parser function itself as this module's value.
    module.exports = sentence_parser;
  • ¶

    console.log(sentence_parser('For example. This doesn\'t work for the US'));