Remove Unicode (Non-ASCII) Characters
Code Properties
- Language: Python
- Concept: String Encoding
Overview
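A simple technique for removing non-ASCII characters (emoji, accented letters, special symbols) from a Python string: encode it to ASCII while ignoring characters that cannot be encoded, then decode the result back to a string.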
Code
# Sample input containing two emoji, which lie outside the ASCII range.
text = "Happy Holi 😀 May this festival bring happiness 🥰"
# Encoding to ASCII with errors ignored silently drops every character that
# has no ASCII representation; decoding turns the bytes back into a str.
ascii_only = text.encode(encoding="ascii", errors="ignore")
clean_text = ascii_only.decode()
print(clean_text)
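# Output: Happy Holi  May this festival bring happiness
# (the spaces that surrounded each emoji are kept, so a double space and a
# trailing space remain)
Usage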
def remove_unicode(text: str) -> str:
    """Remove every non-ASCII character from *text*.

    The string is encoded to ASCII with the ``"ignore"`` error handler, which
    drops any character that has no ASCII representation, and the resulting
    bytes are decoded back into a string.

    Characters are dropped, not transliterated: ``"café"`` becomes ``"caf"``.
    Whitespace around a removed character is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* containing only ASCII characters.
    """
    # The bytes are pure ASCII by construction, so decode with the same codec.
    return text.encode("ascii", errors="ignore").decode("ascii")
# alternative using regex
import re
# Compiled once at import time so repeated calls reuse the same pattern object.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (e.g. U+1F970)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text: str) -> str:
    """Remove emoji characters from *text* using a regular expression.

    Only code points inside the ranges listed in ``_EMOJI_PATTERN`` are
    removed; other non-ASCII characters (accented letters, CJK, ...) are
    kept. Whitespace around a removed emoji is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every run of matched emoji deleted.
    """
    return _EMOJI_PATTERN.sub("", text)


# Appendix
Note created on 2024-04-23 and last modified on 2024-12-31.
See Also
Backlinks
(c) No Clocks, LLC | 2024