# Source code for CrackingCodes.Ch11.detectEnglish

"""Detect English Module

Provides functions to determine whether a given string is in the English language.

Attributes:
    UPPERLETTERS (str): String containing the 26 uppercase letters of the basic Latin alphabet (A-Z).
    LETTERS_AND_SPACE (str): String containing upper and lowercase letters as well as space, newline, and tab.
    DICTIONARY_FILE (str): String containing absolute path of dictionary.txt file.
    ENGLISH_WORDS (dict): Dictionary containing all words from dictionary.txt as keys.

Example:
    >>> import pythontutorials.books.CrackingCodes.Ch11.detectEnglish as detectEnglish
    >>> someString = 'Enthusiasm is contagious. Not having enthusiasm is also contagious.'
    >>> detectEnglish.isEnglish(someString)  # Returns True or False
    True

Note:
    * https://www.nostarch.com/crackingcodes/ (BSD Licensed)
    * There must be a "dictionary.txt" file in this directory with all
      English words in it, one word per line. You can download this from
      https://www.nostarch.com/crackingcodes/.
"""

import os

# The 26 uppercase letters A-Z.
UPPERLETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Every character removeNonLetters() keeps: upper- and lowercase letters plus space, tab and newline.
LETTERS_AND_SPACE = UPPERLETTERS + UPPERLETTERS.lower() + ' \t\n'
# Absolute path of dictionary.txt. It is resolved relative to this module's own
# location (the word list is expected to sit next to this file) rather than a
# machine-specific home directory, so the module works wherever it is installed.
DICTIONARY_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dictionary.txt')


def loadDictionary(dictionaryPath: str = '') -> dict:
    """Load dictionary file

    Reads a word list (one word per line) and creates a dictionary with
    every line of the file as a key.

    Args:
        dictionaryPath: Path of the word list to read. When empty (the
            default), DICTIONARY_FILE is used.

    Returns:
        Dictionary with all words in the file as keys; every value is None.

    Raises:
        OSError: If the file cannot be opened or read.

    Note:
        * The text is split on newline characters, so a file that ends
          with a trailing newline also produces an empty-string key.
    """
    if not dictionaryPath:
        dictionaryPath = DICTIONARY_FILE
    # The with-block guarantees the file is closed even if reading raises.
    with open(dictionaryPath) as dictionaryFile:
        return dict.fromkeys(dictionaryFile.read().split('\n'))


# Word lookup table, built once when the module is imported; importing this
# module therefore reads the dictionary file from disk.
ENGLISH_WORDS = loadDictionary()


def getEnglishCount(message: str) -> float:
    """Get ratio of English words

    Uppercases the message, strips everything removeNonLetters() does not
    keep, splits the result on whitespace and checks each piece against
    ENGLISH_WORDS.

    Args:
        message: String with message to check for English words.

    Returns:
        Number of words found in ENGLISH_WORDS divided by the total number
        of words, or 0.0 when the message contains no words at all.
    """
    candidates = removeNonLetters(message.upper()).split()

    if len(candidates) == 0:  # Nothing left to look up, so avoid dividing by zero
        return 0.0

    hits = sum(1 for candidate in candidates if candidate in ENGLISH_WORDS)
    return float(hits) / len(candidates)


def removeNonLetters(message: str) -> str:
    """Strip characters that are not letters or whitespace

    Keeps only the symbols of message that appear in LETTERS_AND_SPACE
    (letters, space, tab and newline), preserving their order.

    Args:
        message: String with message to remove non-letter characters from.

    Returns:
        New string made of the kept characters only.
    """
    return ''.join(symbol for symbol in message if symbol in LETTERS_AND_SPACE)


def isEnglish(message: str, wordPercentage: int = 20, letterPercentage: int = 85) -> bool:
    """Determines whether message is English

    Using given word percentage and letter percentage, determines if a given message is in the English language.

    Args:
        message: String containing message to determine if it is English.
        wordPercentage: Integer representing percentage of words in message that must be English.
        letterPercentage: Integer representing percentage of characters in message that must be letters or spaces.

    Returns:
        True if message is in English language, False otherwise. An empty
        message is never considered English.

    Note:
        * By default, 20% of the words must exist in the dictionary file, and
          85% of all the characters in the message must be letters or spaces
          (not punctuation or numbers).
    """
    # An empty message has no characters to measure; returning early also
    # avoids dividing by len(message) == 0 below.
    if not message:
        return False

    wordsMatch = getEnglishCount(message) * 100 >= wordPercentage
    numLetters = len(removeNonLetters(message))
    messageLettersPercentage = float(numLetters) / len(message) * 100
    lettersMatch = messageLettersPercentage >= letterPercentage
    return wordsMatch and lettersMatch