Skip to content

Api dua

clean_sentence_indonesia(text, to_lower=True, use_stemming=True, remove_stopwords=True, remove_punctuation=True, remove_numbers=True, remove_urls=True, remove_mentions=True, remove_hashtags=True, remove_retweet=True, remove_extra_whitespace=True)

Cleans a text sentence by applying various preprocessing techniques.

Parameters:

Name Type Description Default
text str

The input text sentence.

required
to_lower bool

Whether to convert the text to lowercase. Defaults to True.

True
use_stemming bool

Whether to apply stemming to words. Defaults to True.

True
remove_stopwords bool

Whether to remove Indonesian stop words. Defaults to True.

True
remove_punctuation bool

Whether to remove punctuation characters. Defaults to True.

True
remove_numbers bool

Whether to remove numeric characters. Defaults to True.

True
remove_urls bool

Whether to remove URLs. Defaults to True.

True
remove_mentions bool

Whether to remove mentions (starting with "@"). Defaults to True.

True
remove_hashtags bool

Whether to strip the "#" symbol from hashtags (the tag text itself is kept). Defaults to True.

True
remove_retweet bool

Whether to remove "RT" at the beginning of retweets. Defaults to True.

True
remove_extra_whitespace bool

Whether to remove extra whitespace characters. Defaults to True.

True

Returns:

Name Type Description
str str

The cleaned text sentence.

Examples:

>>> text = "RT @canggih: Mereka lihat ini: https://example.com"
>>> cleaned_text = clean_sentence_indonesia(text, remove_urls=True, remove_mentions=True, remove_hashtags=True)
>>> print(cleaned_text)
mereka lihat ini
Notes
  • This function uses the Sastrawi library for stemming.
  • Consider adjusting the parameters based on your specific requirements.
Source code in kedatatext/preprocessing.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def clean_sentence_indonesia(
    text: str,
    to_lower: bool = True,
    use_stemming: bool = True,
    remove_stopwords: bool = True,
    remove_punctuation: bool = True,
    remove_numbers: bool = True,
    remove_urls: bool = True,
    remove_mentions: bool = True,
    remove_hashtags: bool = True,
    remove_retweet: bool = True,
    remove_extra_whitespace: bool = True,
) -> str:
    """
    Cleans a text sentence by applying various preprocessing techniques.

    The enabled steps always run in this order: retweet marker, URLs,
    mentions, hashtag symbols, numbers, lowercasing, punctuation,
    whitespace normalization, stop-word removal, stemming.

    Args:
        text (str): The input text sentence.
        to_lower (bool, optional): Whether to convert the text to lowercase. Defaults to True.
        use_stemming (bool, optional): Whether to pass the text through the module-level
            ``stemmer``. Defaults to True.
        remove_stopwords (bool, optional): Whether to drop tokens found in NLTK's
            ``"indonesian"`` stop-word list. Defaults to True.
        remove_punctuation (bool, optional): Whether to delete every character listed in
            ``string.punctuation``. Defaults to True.
        remove_numbers (bool, optional): Whether to delete runs of digits. Defaults to True.
        remove_urls (bool, optional): Whether to delete every non-whitespace run that
            starts with ``http``. Defaults to True.
        remove_mentions (bool, optional): Whether to delete mentions, i.e. "@" followed by
            ASCII letters, digits or underscores. Defaults to True.
        remove_hashtags (bool, optional): Whether to strip the "#" symbol. The tag text
            itself is kept. Defaults to True.
        remove_retweet (bool, optional): Whether to drop a leading "RT " marker.
            Defaults to True.
        remove_extra_whitespace (bool, optional): Whether to collapse whitespace runs into
            single spaces and trim both ends. Defaults to True.

    Returns:
        str: The cleaned text sentence.

    Examples:
        >>> text = "RT @canggih: Mereka lihat ini: https://example.com"
        >>> cleaned_text = clean_sentence_indonesia(text, use_stemming=False, remove_stopwords=False)
        >>> print(cleaned_text)
        mereka lihat ini

    Notes:
        - Stemming is delegated to the module-level `stemmer` object, which is defined
          outside this function (the original notes describe it as a `Sastrawi` stemmer).
        - Consider adjusting the parameters based on your specific requirements.
    """

    # Drop the retweet marker; only an exact leading "RT " counts.
    if remove_retweet and text.startswith("RT "):
        text = text[3:].strip()

    # Remove URLs
    if remove_urls:
        text = re.sub(r"http\S+", "", text)

    # Remove mentions. The underscore is part of the handle, so it must be
    # matched too; otherwise "@user_name" would leave "_name" behind.
    if remove_mentions:
        text = re.sub(r"@[A-Za-z0-9_]+", "", text)

    # Strip the hashtag symbol but keep the tag word.
    if remove_hashtags:
        text = re.sub(r"#", "", text)

    # Remove numbers
    if remove_numbers:
        text = re.sub(r"\d+", "", text)

    # Convert to lowercase
    if to_lower:
        text = text.lower()

    # Remove punctuation
    if remove_punctuation:
        text = text.translate(str.maketrans("", "", string.punctuation))

    # Normalize whitespace only after every character-deleting step above:
    # deleting a URL, mention, number or punctuation mark can leave leading,
    # trailing or doubled spaces behind.
    if remove_extra_whitespace:
        text = " ".join(text.split())

    # Remove stop words
    if remove_stopwords:
        stop_words = set(stopwords.words("indonesian"))
        word_tokens = word_tokenize(text)
        filtered_sentence = [word for word in word_tokens if word not in stop_words]
        text = " ".join(filtered_sentence)

    # Stemming
    if use_stemming:
        text = stemmer.stem(text)

    return text

clean_sentences_indonesia(list_of_texts, to_lower=True, use_stemming=True, remove_stopwords=True, remove_punctuation=True, remove_numbers=True, remove_urls=True, remove_mentions=True, remove_hashtags=True, remove_retweet=True, remove_extra_whitespace=True)

Cleans a list of text sentences by applying various preprocessing techniques.

Parameters:

Name Type Description Default
list_of_texts List[str]

A list of input text sentences.

required
to_lower bool

Whether to convert the text to lowercase. Defaults to True.

True
use_stemming bool

Whether to apply stemming to words. Defaults to True.

True
remove_stopwords bool

Whether to remove Indonesian stop words. Defaults to True.

True
remove_punctuation bool

Whether to remove punctuation characters. Defaults to True.

True
remove_numbers bool

Whether to remove numeric characters. Defaults to True.

True
remove_urls bool

Whether to remove URLs. Defaults to True.

True
remove_mentions bool

Whether to remove mentions (starting with "@"). Defaults to True.

True
remove_hashtags bool

Whether to strip the "#" symbol from hashtags (the tag text itself is kept). Defaults to True.

True
remove_retweet bool

Whether to remove "RT" at the beginning of retweets. Defaults to True.

True
remove_extra_whitespace bool

Whether to remove extra whitespace characters. Defaults to True.

True

Returns:

Type Description
List[str]

A list of cleaned text sentences.

Examples:

>>> list_of_texts = ["RT @canggih: Mereka lihat ini: https://example.com", "Halo 123!"]
>>> cleaned_texts = clean_sentences_indonesia(list_of_texts)
>>> print(cleaned_texts)
['mereka lihat ini', 'halo']
Source code in kedatatext/preprocessing.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def clean_sentences_indonesia(
    list_of_texts: List[str],
    to_lower: bool = True,
    use_stemming: bool = True,
    remove_stopwords: bool = True,
    remove_punctuation: bool = True,
    remove_numbers: bool = True,
    remove_urls: bool = True,
    remove_mentions: bool = True,
    remove_hashtags: bool = True,
    remove_retweet: bool = True,
    remove_extra_whitespace: bool = True,
) -> List[str]:
    """
    Cleans every sentence in a list with `clean_sentence_indonesia`.

    Each flag is forwarded unchanged to `clean_sentence_indonesia`, so the
    same preprocessing configuration is applied to every sentence.

    Args:
        list_of_texts (List[str]): The input text sentences.
        to_lower (bool, optional): Whether to convert the text to lowercase. Defaults to True.
        use_stemming (bool, optional): Whether to apply stemming to words. Defaults to True.
        remove_stopwords (bool, optional): Whether to remove Indonesian stop words. Defaults to True.
        remove_punctuation (bool, optional): Whether to remove punctuation characters. Defaults to True.
        remove_numbers (bool, optional): Whether to remove numeric characters. Defaults to True.
        remove_urls (bool, optional): Whether to remove URLs. Defaults to True.
        remove_mentions (bool, optional): Whether to remove mentions (starting with "@"). Defaults to True.
        remove_hashtags (bool, optional): Forwarded as `remove_hashtags`. Defaults to True.
        remove_retweet (bool, optional): Whether to remove "RT" at the beginning of retweets. Defaults to True.
        remove_extra_whitespace (bool, optional): Whether to remove extra whitespace characters. Defaults to True.

    Returns:
        List[str]: The cleaned sentences, in the same order as the input.

    Examples:
        >>> list_of_texts = ["Halo Dunia", "Apa Kabar"]
        >>> cleaned_texts = clean_sentences_indonesia(list_of_texts, use_stemming=False, remove_stopwords=False)
        >>> print(cleaned_texts)
        ['halo dunia', 'apa kabar']
    """
    # Collect the flags once and pass them by keyword, so the call does not
    # depend on the positional order of the single-sentence function.
    options = {
        "to_lower": to_lower,
        "use_stemming": use_stemming,
        "remove_stopwords": remove_stopwords,
        "remove_punctuation": remove_punctuation,
        "remove_numbers": remove_numbers,
        "remove_urls": remove_urls,
        "remove_mentions": remove_mentions,
        "remove_hashtags": remove_hashtags,
        "remove_retweet": remove_retweet,
        "remove_extra_whitespace": remove_extra_whitespace,
    }

    cleaned_sentences = []
    for sentence in list_of_texts:
        cleaned_sentences.append(clean_sentence_indonesia(sentence, **options))
    return cleaned_sentences