Languages

Extract Arabic Words From a Text File

Your rating: None Average: 5 (5 votes)
/**
*
*    Description  : A simple snippet demonstrating how to extract Arabic words from a text file in the Vala programming language
*    Vala version : 0.7.8
*    Developed by : Emad Al-Bloushi
*    Date         : Mon 23 Nov, 2009
*    Compile with : valac --pkg gee-1.0 word.vala
*
**/
 
using Gee;
 
/**
 * Moves the word accumulated in //word// into //word_list//.
 *
 * An empty buffer is ignored, so runs of consecutive separators (for example
 * a space followed by a line break) never add an empty string to the set.
 * The buffer is cleared afterwards so it can collect the next word.
 *
 * @param word buffer holding the characters collected so far
 * @param word_list set receiving a copy of the finished word
 */
static void flush_word (StringBuilder word, HashSet<string> word_list) {
	if (word.len > 0) {
		word_list.add (word.str);
		word.truncate (0);
	}
}

/**
 * Extracts the distinct words of a UTF-8 text file and writes them out.
 *
 * Each character is classified by its Unicode general category. Caseless
 * letters (the category Arabic letters belong to), digits, non-spacing marks
 * and the Arabic tatweel are collected into the current word; separators,
 * punctuation, symbols, control and format characters end it. Characters of
 * any other category (cased letters, for instance) are skipped without ending
 * the word. Every distinct word is printed to stdout and written, one per
 * line, to the output file.
 *
 * @param args optional input path in args[1] (default "arabic-file.txt") and
 *             optional output path in args[2] (default "word_list.txt")
 * @return 0 on success; 1 if the input cannot be read, is not valid UTF-8,
 *         or the output cannot be written
 */
static int main (string[] args) {

	string filename = args.length > 1 ? args[1] : "arabic-file.txt";
	string out_filename = args.length > 2 ? args[2] : "word_list.txt";
	string content;

	try {
		FileUtils.get_contents (filename, out content);
	} catch (FileError e) {
		stderr.printf ("%s\n", e.message);
		return 1;
	}

	// get_char () / next_char () assume well-formed UTF-8; a truncated
	// sequence at the end of the data could make the scan run past the
	// terminating NUL, so reject malformed input up front.
	if (!content.validate ()) {
		stderr.printf ("%s: invalid UTF-8 encoding\n", filename);
		return 1;
	}

	var word = new StringBuilder ();
	var word_list = new HashSet<string> ();

	// Set after a teh marbuta has been appended. That letter is always
	// word-final, but diacritics may still follow it, so the word is only
	// flushed once the next letter or a separator shows up.
	bool word_closed = false;

	for (unowned string s = content; s.get_char () != 0; s = s.next_char ()) {

		unichar c = s.get_char ();

		switch (c.type ()) {

			case UnicodeType.OTHER_LETTER:
			case UnicodeType.DECIMAL_NUMBER:
			case UnicodeType.OTHER_NUMBER:
				if (word_closed) {
					// A new word starts right after a teh marbuta
					// without any separator in between.
					flush_word (word, word_list);
					word_closed = false;
				}
				word.append_unichar (c);
				// U+06C3 TEH MARBUTA GOAL or U+0629 TEH MARBUTA
				if (c == 'ۃ' || c == 'ة') {
					word_closed = true;
				}
				break;

			case UnicodeType.MODIFIER_LETTER:
				// Keep only ARABIC TATWEEL (U+0640); other modifier
				// letters are dropped.
				if (c == 'ـ') {
					word.append_unichar (c);
				}
				break;

			case UnicodeType.NON_SPACING_MARK:
				// Diacritics stay attached to the word they decorate.
				word.append_unichar (c);
				break;

			case UnicodeType.FORMAT:
			case UnicodeType.CONTROL:

			case UnicodeType.SPACE_SEPARATOR:
			case UnicodeType.LINE_SEPARATOR:
			case UnicodeType.PARAGRAPH_SEPARATOR:

			case UnicodeType.CONNECT_PUNCTUATION:
			case UnicodeType.OPEN_PUNCTUATION:
			case UnicodeType.CLOSE_PUNCTUATION:
			case UnicodeType.INITIAL_PUNCTUATION:
			case UnicodeType.DASH_PUNCTUATION:
			case UnicodeType.FINAL_PUNCTUATION:
			case UnicodeType.OTHER_PUNCTUATION:

			case UnicodeType.MATH_SYMBOL:
			case UnicodeType.CURRENCY_SYMBOL:
			case UnicodeType.OTHER_SYMBOL:
				// Any of these ends the current word.
				flush_word (word, word_list);
				word_closed = false;
				break;

			default:
				// Remaining categories (cased letters, spacing marks, ...)
				// are neither collected nor treated as word boundaries.
				break;
		}
	}

	// The input may end without a trailing separator; do not lose the
	// word that is still in the buffer.
	flush_word (word, word_list);

	var output = new StringBuilder ();
	foreach (string w in word_list) {
		output.append (w);
		output.append_c ('\n');
		stdout.printf ("Word : %s\n", w);
	}

	try {
		FileUtils.set_contents (out_filename, output.str);
	} catch (FileError e) {
		stderr.printf ("%s\n", e.message);
		return 1;
	}

	return 0;
}