Language Detection: statistical text analysis. Submit a string or a text file to analyse its language. The analysis is based on so-called "fingerprints": a .lm (text) file that holds the n-gram pattern of one language. Download the existing fingerprints so that you do not have to generate one for each language yourself. Detection of the analysed string will always return the language of the best-matching fingerprint, so you need the fingerprint patterns of as many languages as possible. Also note: detection depends on the text encoding as well.
Fingerprint-listing [download .ZIP] more info about Statistical Text Analysis outline1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
<?php
/**
* php class LangDetect
* statistical text analysis of strings and text files
*
*
* @date 2010-02-10
* @author reto fassbind
* @URL http://www.boxoffice.ch/myclass/langDetect/langDetect_HL.php
* @desc Submit a text in a foreign language you don't recognize
* and find out what language it is.
* More information @ http://www.boxoffice.ch/pseudo/
* Free to use
*
* FINGERPRINTS:
* List http://www.boxoffice.ch/pseudo/LM_orig/
* Download http://www.boxoffice.ch/pseudo/finger_prints.zip
*
* Note: you need to download all available "fingerprints" for a working language detection!
* Otherwise you will have to generate your own fingerprints first.
* The result of LangDetect will always give the language of the best matching fingerprint. So it is
* mandatory to have fingerprints of as many languages (and different encodings) as possible!
*/
/**
 * Statistical language detection based on ranked n-gram "fingerprints".
 *
 * A text is cut into n-grams of 1..MAX_CHARS_PER_NGRAM bytes, the n-grams are
 * ranked by frequency, and that ranking is compared with the ranking stored in
 * every fingerprint (.lm) file. The fingerprint with the smallest accumulated
 * rank deviation wins.
 *
 * N-grams are built byte-wise (strlen/substr) on purpose: the fingerprints are
 * byte patterns of a language *in a given encoding*, so the analysed text has
 * to be chopped the same way.
 */
class LangDetect
{
    private $dir_fingerprints;       // directory of fingerprint-files
    private $fingerprints = array(); // array of fingerprints: array(language => array(ngrams))
    private $ngrams = array();       // most frequent ngrams of a given text
    private $string = '';
    private $filepath = '';

    const MAX_CHARS_PER_NGRAM = 4;          // max length of an n-gram
    const MAX_NGRAMS_PER_FINGERPRINT = 400; // number of n-grams read per fingerprint
    const MAX_NGRAMS_PER_STRING = 400;      // number of n-grams kept for the analysed text
    const EXTENSION_FINGERPRINT = 'lm';     // extension of fingerprint library files

    /**
     * Sets the path to the directory with all the fingerprint files.
     * A trailing slash is optional.
     *
     * @param string $path
     * @return void
     */
    public function setDirFingerprints($path)
    {
        $this->dir_fingerprints = $path;
    }

    /**
     * Set a (new) string that can be analysed later.
     *
     * @param string $string
     * @return void
     */
    public function setString($string)
    {
        $this->string = $string;
    }

    /**
     * Set the path of a (text) file to analyse.
     *
     * @param string $filepath
     * @return void
     */
    public function setFilepath($filepath)
    {
        $this->filepath = $filepath;
    }

    /**
     * Get the ranked n-grams of the string set with setString().
     *
     * The list is recomputed on every call so that it always reflects the
     * current string (the previous implementation returned null on the first
     * call and a stale list after the string had been changed).
     *
     * @return array|null list of n-gram strings, most frequent first; null when no string is set
     */
    public function getNgrams()
    {
        if ($this->string === null || $this->string === '') {
            return null;
        }
        $this->chopStringToNgrams();
        return $this->ngrams;
    }

    /**
     * Get the 2-dimensional array of the whole fingerprint library,
     * loading it from the fingerprint directory if it is not loaded yet.
     *
     * @return array array(language => list of n-grams)
     */
    public function getFingerprints()
    {
        if (!count($this->fingerprints)) {
            $this->readOutFingerprints();
        }
        return $this->fingerprints;
    }

    /**
     * Read out the content of the text file set with setFilepath().
     *
     * @return string|null the file's non-empty lines joined by blanks; null when no path is set
     */
    public function getFilecontent()
    {
        if (!$this->filepath) {
            return null;
        }
        // Was readFileToString($filepath): an undefined local, so nothing was ever read.
        $this->readFileToString($this->filepath);
        return $this->string;
    }

    /**
     * Analyse a text to find out what language it is.
     *
     * @param string  $string the text to be analysed
     * @param boolean $type   false: return only the best language; true: return the whole result array
     * @return mixed language name, or array(language => deviation) sorted best match first,
     *               or null when there is no text or (for $type = false) no fingerprint
     */
    public function analyseString($string, $type = false)
    {
        if ($string === null || $string === false || $string === '') {
            return null;
        }
        $this->string = (string) $string;
        return $this->detect($type);
    }

    /**
     * Analyse a (text) file to find out what language it was written in.
     *
     * @param string  $filepath the path to the file
     * @param boolean $type     false: return only the best language; true: return the whole result array
     * @return mixed language name, or array(language => deviation) sorted best match first,
     *               or null when the file is missing/empty or (for $type = false) no fingerprint exists
     */
    public function analyseFile($filepath, $type = false)
    {
        if (!$filepath) {
            return null;
        }
        $this->readFileToString($filepath);
        if ($this->string === '') {
            // Nothing could be read; ranking an empty text would only produce noise.
            return null;
        }
        return $this->detect($type);
    }

    /**
     * Generate new fingerprints from one or many text files (default: *.txt)
     * that reside inside a given source directory. The new fingerprints are
     * saved to the fingerprint directory (target). The name of the text file
     * is used as the language name of the fingerprint.
     *
     * The process working directory is left untouched, and both directories
     * may be given with or without a trailing slash.
     *
     * @param string $dir       path of the directory with the text source files
     * @param string $extension the file extension of the source files (default txt)
     * @return string textual report of the fingerprints that were generated
     */
    public function generateFingerprints($dir, $extension = 'txt')
    {
        $report = '*** No Files were Generated: ';
        if (!is_string($dir) || !is_dir($dir)) {
            return $report . "\n Source directory was not found (generating)";
        }
        if (!is_string($this->dir_fingerprints) || !is_dir($this->dir_fingerprints)) {
            return $report . "\n Target directory of the fingerprints was not found";
        }
        $files = $this->listFiles($dir, $extension);
        if (count($files) === 0) {
            return $report . "\n No files found in the source directory";
        }

        $target_dir = $this->withTrailingSeparator($this->dir_fingerprints);
        $report = 'Generated file(s): ';
        $count = 1;
        foreach ($files as $file) {
            $this->readFileToString($file);
            $this->chopStringToNgrams();
            $filename = basename($file, '.' . $extension) . '.' . self::EXTENSION_FINGERPRINT;

            // One line per n-gram: "<ngram>\t <rank>\n" (rank is 1-based).
            $content = '';
            foreach ($this->ngrams as $key => $ngram) {
                $content .= $ngram . "\t " . ($key + 1) . "\n";
            }
            if (file_put_contents($target_dir . $filename, $content) === false) {
                $report .= "\n ***[$count] could not write: " . $filename;
            } else {
                $report .= "\n ***[$count] generated: " . $filename;
            }
            $count++;
        }
        return $report;
    }

    /**
     * Run the shared detection pipeline on the text currently held in $this->string.
     *
     * @param boolean $type see analyseString()
     * @return mixed
     */
    private function detect($type)
    {
        $this->readOutFingerprints();
        $this->chopStringToNgrams();
        $result = $this->doLanguageMatching();
        if ($type) {
            return $result;
        }
        if (count($result) === 0) {
            return null; // no fingerprint available, so there is no "best" language
        }
        reset($result);
        return key($result);
    }

    /**
     * Return $path with exactly one trailing directory separator.
     *
     * @param string $path
     * @return string
     */
    private function withTrailingSeparator($path)
    {
        return rtrim($path, '/\\') . DIRECTORY_SEPARATOR;
    }

    /**
     * List the regular files with the given extension inside a directory,
     * as full paths, without changing the working directory.
     *
     * @param string $dir
     * @param string $extension
     * @return array list of file paths
     */
    private function listFiles($dir, $extension)
    {
        $matches = glob($this->withTrailingSeparator($dir) . '*.' . $extension);
        if ($matches === false) {
            return array();
        }
        return array_values(array_filter($matches, 'is_file'));
    }

    /**
     * Creates a two-dimensional array of all the fingerprints inside the
     * fingerprint directory. First dimension is the language (file name without
     * extension), second dimension are the first MAX_NGRAMS_PER_FINGERPRINT
     * n-grams of the file in file order (= rank order).
     *
     * @return void
     */
    private function readOutFingerprints()
    {
        $fingerprints = array();
        if (is_string($this->dir_fingerprints) && is_dir($this->dir_fingerprints)) {
            $files = $this->listFiles($this->dir_fingerprints, self::EXTENSION_FINGERPRINT);
            foreach ($files as $fingerprint_file) {
                $handle = fopen($fingerprint_file, 'r');
                if ($handle === false) {
                    continue;
                }
                $language = basename($fingerprint_file, '.' . self::EXTENSION_FINGERPRINT);
                $fingerprints[$language] = array();
                for ($i = 0; $i < self::MAX_NGRAMS_PER_FINGERPRINT; $i++) {
                    $line = fgets($handle);
                    if ($line === false) {
                        break; // short file: stop instead of padding with empty n-grams
                    }
                    $parts = explode(' ', $line);
                    $fingerprints[$language][] = trim($parts[0]); // $parts[0] is the n-gram
                }
                fclose($handle);
            }
        }
        $this->fingerprints = $fingerprints;
    }

    /**
     * Read all text from a file (e.g. .txt) into $this->string: every
     * non-empty line is trimmed and appended with a leading blank.
     * A missing or unreadable file yields an empty string.
     *
     * @param string $file path and name of a source file
     * @return void
     */
    private function readFileToString($file)
    {
        $string = '';
        $handle = is_file($file) ? fopen($file, 'r') : false;
        if ($handle !== false) {
            while (!feof($handle)) {
                // read at most 528288 bytes per chunk; fgets() yields false at EOF
                $line = trim((string) fgets($handle, 528288));
                if ($line !== '') {
                    $string .= ' ' . $line;
                }
            }
            fclose($handle);
        }
        $this->string = $string;
    }

    /**
     * Produce the ranked 1-dim array of n-grams of $this->string and store it
     * in $this->ngrams (most frequent first, at most MAX_NGRAMS_PER_STRING).
     *
     * @return void
     */
    private function chopStringToNgrams()
    {
        $counts = array();
        foreach (explode(' ', (string) $this->string) as $word) {
            $word = '_' . $word . '_'; // mark word boundaries
            $length = strlen($word);
            for ($start = 0; $start < $length; $start++) {
                $longest = min(self::MAX_CHARS_PER_NGRAM, $length - $start);
                for ($size = 1; $size <= $longest; $size++) {
                    $ngram = substr($word, $start, $size);
                    $counts[$ngram] = isset($counts[$ngram]) ? $counts[$ngram] + 1 : 1;
                }
            }
        }

        arsort($counts);
        // preserve_keys = true: PHP turns numeric n-grams such as "12" into integer
        // keys, and array_slice() would otherwise renumber them to 0, 1, 2, ...
        $top = array_slice($counts, 0, self::MAX_NGRAMS_PER_STRING, true);

        $ngrams = array();
        foreach ($top as $ngram => $count) {
            $ngrams[] = (string) $ngram;
        }
        $this->ngrams = $ngrams;
    }

    /**
     * Compare the n-gram ranking of the analysed text with every fingerprint
     * and return the languages sorted by deviation, best match first.
     *
     * For each n-gram of the text the deviation grows by the rank distance to
     * the same n-gram in the fingerprint, or by MAX_NGRAMS_PER_FINGERPRINT when
     * the fingerprint does not contain it.
     *
     * @return array key = language (fingerprint name), value = deviation
     */
    private function doLanguageMatching()
    {
        $result = array();
        foreach ($this->fingerprints as $language => $fingerprint) {
            // n-gram => first rank. An exact-match lookup table replaces the two
            // linear, loosely comparing scans (in_array + array_search) per n-gram,
            // which treated e.g. "1" and "01" as the same n-gram.
            $rank_of = array();
            foreach ($fingerprint as $rank => $fingerprint_ngram) {
                if (!isset($rank_of[$fingerprint_ngram])) {
                    $rank_of[$fingerprint_ngram] = $rank;
                }
            }

            $delta = 0;
            foreach ($this->ngrams as $rank => $ngram) {
                if (isset($rank_of[$ngram])) {
                    $delta += abs($rank - $rank_of[$ngram]);
                } else {
                    $delta += self::MAX_NGRAMS_PER_FINGERPRINT;
                }
            }
            $result[$language] = $delta;
        }
        asort($result);
        return $result;
    }
} // end of class
?>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
<?php
/**
* Test script using PHP CLASS LangDetect
*
* @date 2010-02-10
* @author reto fassbind
* @desc test script using PHP CLASS LangDetect
*
* @note you need to download all available "fingerprints" for a working language detection!
*
* @url FINGERPRINTS:
* listing http://www.boxoffice.ch/pseudo/LM_orig/
* download http://www.boxoffice.ch/pseudo/finger_prints.zip
*
*/
// 1. Load LangDetect and set the path to the fingerprints (mandatory for analysis).
//    require_once stops right here if the class file is missing, instead of
//    failing later on "new LangDetect()". Keep the trailing slash on the
//    directory so it can be prefixed to fingerprint file names as-is.
require_once 'LangDetect.php';
$ld = new LangDetect();
$ld->setDirFingerprints($_SERVER['DOCUMENT_ROOT'] . '/path/to/your/fingerprint/lm/files/');

// 2. Analyse the language of a given string
$input_es = "No tengo mucho tiempo para pensar lo que las palabras quieren decir";
echo '<hr>** 1.) The language of the string is: ' . $ld->analyseString($input_es);
echo "<br />";

// 3. Analyse the language of a given (text) file
$file_a = $_SERVER['DOCUMENT_ROOT'] . '/textfiles/test.txt';
echo '<hr>** 2.) File test.txt analyzed language: ' . $ld->analyseFile($file_a);
echo "<br />";

// 4. Generate new fingerprints from the files in the source folder and save
//    them in the fingerprints dir
$dir_source = $_SERVER['DOCUMENT_ROOT'] . '/new_languages_text_to_generate/';
echo "<hr>** 3.) The following fingerprints have been generated from file(s) in :<br />";
echo "source: " . $dir_source . "<br />";
// The report is plain text with "\n" separators and contains file names:
// escape it and turn the newlines into <br /> so it renders correctly in HTML.
echo nl2br(htmlspecialchars($ld->generateFingerprints($dir_source)));
echo "<hr>";
//getters / setter
// // get array of all ngrams of a given string
// echo '<h1>The Ngram Array of an english string</h1>';
// $input_en = "Just comment in or out some short text examples";
// $t = new LangDetect();
// $t->setString($input_en);
// $t->getNgrams();
//
// // get array of all fingerpritns
// echo '<h1>The Fingerpritnt (loooong array9</h1>';
// $dir = $_SERVER['DOCUMENT_ROOT'] . '/fingerprints/';
// $ld->setDirFingerprints($dir);
// //print_r($ld->getFingerprints());
//
//
// // get text content of a file
// echo '<h1>Display content of a textfile</h1>';
// $file_b = $_SERVER['DOCUMENT_ROOT'] . '/textfiles/emule_readme.txt';
// $ld->setFilepath($file_b);
// echo $ld->getFilecontent();
?>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
<?php
/**
 * Statistical language detection based on ranked n-gram "fingerprints"
 * (legacy API: the input and the mode are given to the constructor).
 *
 * Usage:
 *   new LangDetect($textOrTxtFile)               analyze() returns the best language
 *   new LangDetect($textOrTxtFile, -1, $lmDir)   analyze() returns array(language => deviation)
 *   new LangDetect($sourceDir, 'g')              Generate() writes a .lm file next to every .txt file
 *
 * N-grams are built byte-wise (strlen/substr): fingerprints are byte patterns
 * of a language in a given encoding.
 */
class LangDetect
{
    // don't change unless you use your own fingerprints
    public $ng_max_chars = 4;   // maximum length of an n-gram (1- to 4-grams here)
    public $ng_number_lm = 400; // default number of n-grams in LM fingerprints;
                                // also the penalty for an n-gram missing from a fingerprint

    // reasonable defaults
    public $ng_number_sub = 350; // default number of n-grams created from the analysed text
    public $max_delta = 140000;  // stop evaluating a language that deviates this strongly
    public $limit_lines = 100;   // max number of non-empty lines used from a text file (-1 = all lines)

    // state (formerly created on the fly as dynamic properties)
    public $input;               // text, .txt file path, or source directory (mode 'g')
    public $result_type;         // 1 = single result, -1 = result array, 'g' = generate
    public $dir;                 // directory of the .lm fingerprint files
    public $dir_generate;        // directory of the .txt files to generate fingerprints from
    public $string_readfile;     // path of the text file read by extractText()
    public $string_used;         // the text that is analysed
    public $lm_ng = array();     // array(language => list of n-grams) of all fingerprints
    public $sub_ng = array();    // ranked n-grams of the analysed text

    /**
     * @param string         $input      text to analyse, path of a .txt file, or (mode 'g') source directory
     * @param int|string|bool $sec       1 / false: single result, -1: result array, 'g': generate fingerprints
     * @param string|bool    $dir_prints directory of the fingerprints (default './finger_prints/')
     */
    public function __construct($input, $sec = false, $dir_prints = false)
    {
        $this->input = $input;
        // Honour the directory argument in every mode; it used to be dropped
        // whenever the 2nd argument was left at its default.
        $this->dir = ($dir_prints != false) ? $dir_prints : './finger_prints/';

        if ($sec == false) {
            $this->result_type = 1;
            return;
        }
        $this->result_type = $sec;
        if ($sec === 'g') {
            $this->ng_number_sub = $this->ng_number_lm;
            $this->dir_generate = $input;
        } elseif ($sec != 1 && $sec != -1) {
            echo "<br>***Invalid 2nd Argument (1 or -1 to analyze, 'g' for Generation)<br>";
        }
    }

    /**
     * MAIN - analyse the string or text file given to the constructor.
     *
     * @return mixed best language (result type 1), array(language => deviation)
     *               (result type -1), or an error message string
     */
    public function analyze()
    {
        if (substr((string) $this->input, -4) === '.txt') {
            $this->string_readfile = $this->input;
            $this->extractText();
        } else {
            $this->string_used = $this->input;
        }

        if (empty($this->string_used)) {
            return "*** Empty Text String /or wrong path/name of text file*****<br>";
        }

        $this->getFingerprint();
        $this->createNGrams();
        if ($this->result_type == 1) { // single result
            return $this->compareNGramsOne();
        }
        if ($this->result_type == -1) { // result array
            return $this->compareNGrams();
        }
        return "<br>*** Error: 2nd Argument must be either 1 or -1<br>";
    }

    /**
     * MAIN - create a fingerprint (.lm) for every .txt file in $dir_generate.
     * The fingerprints are written into that same directory; progress is echoed.
     * The working directory of the process is not changed.
     *
     * @return void
     */
    public function Generate()
    {
        echo "<br>***Generating Fingerprints in: ". $this->dir_generate ."<br>";
        if (is_string($this->dir_generate) && is_dir($this->dir_generate)) {
            $dir = rtrim($this->dir_generate, '/\\') . DIRECTORY_SEPARATOR;
            $files = glob($dir . '*.txt');
            if ($files === false) {
                $files = array();
            }
            $count = 1;
            foreach ($files as $file) {
                $this->string_readfile = $file;
                $this->extractText();
                $filename = basename($file, ".txt") . ".lm";

                // One line per n-gram: "<ngram>\t <rank>\n" (rank is 1-based).
                $content = '';
                foreach ($this->createNGrams() as $key => $ngram) {
                    $content .= $ngram . "\t " . ($key + 1) . "\n";
                }
                if (file_put_contents($dir . $filename, $content) === false) {
                    echo "<br>***[$count] could not write: ". $filename;
                } else {
                    echo "<br>***[$count] generated: ". $filename;
                }
                $count++;
            }
        } elseif (empty($this->dir_generate)) {
            echo "<br>*** Use <b>'g'</b> as 2nd Argument when Generating finger-pritns<br>";
        } else {
            echo "<br>*** ERROR: Directory does not exist!<br>";
        }
    }

    /**
     * Read the n-gram lists of all .lm files in the fingerprint directory.
     * Only the first $ng_number_lm lines of each file are used.
     *
     * @return array array(language => list of n-grams), also stored in $lm_ng
     */
    public function getFingerprint()
    {
        $lm = array();
        $files = false;
        if (is_string($this->dir) && is_dir($this->dir)) {
            $files = glob(rtrim($this->dir, '/\\') . DIRECTORY_SEPARATOR . '*.lm');
        }
        if ($files === false) {
            $files = array();
        }
        foreach ($files as $readfile) {
            $handle = is_file($readfile) ? fopen($readfile, 'r') : false;
            if ($handle === false) {
                echo " *** Pls check this LM -file: ". basename($readfile);
                echo "<br> *** Path". $readfile;
                continue;
            }
            $bsnm = basename($readfile, ".lm");
            $lm[$bsnm] = array();
            for ($i = 0; $i < $this->ng_number_lm; $i++) {
                $line = fgets($handle);
                if ($line === false) {
                    break; // short file: stop instead of padding with empty n-grams
                }
                $part = explode(" ", $line);
                $lm[$bsnm][] = trim($part[0]);
            }
            fclose($handle);
        }
        $this->lm_ng = $lm;
        return $lm;
    }

    /**
     * Create the ranked n-gram list of a string (most frequent first,
     * at most $ng_number_sub entries).
     *
     * @param string|bool $string text to use; false keeps the current $string_used
     * @return array list of n-gram strings, also stored in $sub_ng
     */
    public function createNGrams($string = false)
    {
        if ($string) {
            $this->string_used = $string;
        }

        $counts = array();
        foreach (explode(" ", (string) $this->string_used) as $word) {
            $word = "_" . $word . "_"; // mark word boundaries
            $word_size = strlen($word);
            for ($start = 0; $start < $word_size; $start++) {
                $longest = min($this->ng_max_chars, $word_size - $start);
                for ($size = 1; $size <= $longest; $size++) {
                    $ngram = substr($word, $start, $size);
                    $counts[$ngram] = isset($counts[$ngram]) ? $counts[$ngram] + 1 : 1;
                }
            }
        }

        // sort by frequency, descending
        arsort($counts);
        // preserve_keys = true: PHP turns numeric n-grams such as "12" into integer
        // keys, and array_slice() would otherwise renumber them to 0, 1, 2, ...
        $top = array_slice($counts, 0, $this->ng_number_sub, true);

        $ranked = array();
        foreach ($top as $ngram => $frequency) {
            $ranked[] = (string) $ngram;
        }
        $this->sub_ng = $ranked;
        return $ranked;
    }

    /**
     * Compare the n-grams of the text with all fingerprints.
     *
     * @return array|string array(language => deviation), lowest deviation first;
     *                      the string "sorry nothing no lang found" when every
     *                      language deviates too strongly
     */
    public function compareNGrams()
    {
        $result = array();
        foreach ((array) $this->lm_ng as $lm_basename => $language) {
            $delta = $this->deviation($language, $this->max_delta);
            // include only languages that were not aborted
            if ($delta < $this->max_delta - $this->ng_number_lm) {
                $result[$lm_basename] = $delta;
            }
        }
        if (count($result) === 0) {
            return "sorry nothing no lang found";
        }
        asort($result);
        return $result;
    }

    /**
     * Variation of compareNGrams() that returns one language only.
     * The abort limit shrinks to the best deviation seen so far, so the
     * last language that gets below the limit is the overall best match.
     *
     * @return mixed name of the best matching fingerprint, or the string
     *               "sorry nothing no lang found"
     */
    public function compareNGramsOne()
    {
        $limit = 160000;
        $best = null;
        foreach ((array) $this->lm_ng as $lm_basename => $language) {
            $delta = $this->deviation($language, $limit);
            if ($delta < $limit) {
                $best = $lm_basename;
                $limit = $delta; // lower the limit
            }
        }
        return ($best === null) ? "sorry nothing no lang found" : $best;
    }

    /**
     * Accumulated rank deviation between the analysed text ($sub_ng) and one
     * fingerprint. Evaluation stops as soon as the deviation exceeds $limit.
     *
     * @param array $fingerprint list of n-grams in rank order
     * @param int   $limit       abort threshold
     * @return int deviation (possibly partial when aborted, but then > $limit)
     */
    private function deviation($fingerprint, $limit)
    {
        // n-gram => first rank. An exact-match lookup table replaces the two
        // linear, loosely comparing scans (in_array + array_search) per n-gram,
        // which treated e.g. "1" and "01" as the same n-gram.
        $rank_of = array();
        foreach ((array) $fingerprint as $rank => $ngram) {
            if (!isset($rank_of[$ngram])) {
                $rank_of[$ngram] = $rank;
            }
        }

        $delta = 0;
        foreach ((array) $this->sub_ng as $key => $existing_ngram) {
            if (isset($rank_of[$existing_ngram])) {
                $delta += abs($key - $rank_of[$existing_ngram]);
            } else {
                $delta += $this->ng_number_lm;
            }
            if ($delta > $limit) {
                break; // this language already differs too much
            }
        }
        return $delta;
    }

    /**
     * Read the text of $string_readfile: every non-empty line is trimmed and
     * appended with a leading blank, up to $limit_lines non-empty lines
     * (a limit of 0 or less reads the whole file).
     *
     * @return string the text, also stored in $string_used (empty when the file is missing)
     */
    public function extractText()
    {
        $text = '';
        $handle = is_file($this->string_readfile) ? fopen($this->string_readfile, 'r') : false;
        if ($handle !== false) {
            $lines_used = 0;
            while (!feof($handle)) {
                // The old check compared the limit with a counter that started at 1,
                // so a limit of N used only N-1 lines (and a limit of 1 none at all).
                if ($this->limit_lines > 0 && $lines_used >= $this->limit_lines) {
                    break;
                }
                // read at most 528288 bytes per chunk; fgets() yields false at EOF
                $line = trim((string) fgets($handle, 528288));
                if ($line !== '') {
                    $text .= " " . $line;
                    $lines_used++;
                }
            }
            fclose($handle);
        } else {
            echo "*** Text file NOT FOUND<br>";
        }
        $this->string_used = $text;
        return $text;
    }
}
?>