포스태깅한 CSV 파일을 스테밍한 후 CSV 파일로 내보내기 (stem a POS-tagged CSV file and export the result as a CSV file)

package st;

import java.io.File;

import opennlp.tools.cmdline.postag.POSModelLoader;

import opennlp.tools.postag.POSModel;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.io.PrintWriter;

import com.opencsv.CSVReader;

/*

   Porter stemmer in Java. The original paper is in

       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
       no. 3, pp 130-137,

   See also http://www.tartarus.org/~martin/PorterStemmer

   History:

   Release 1

   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
   is then outside the bounds of b.

   Release 2

   Similarly,

   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
   b[j] is then outside the bounds of b.

   Release 3

   Considerably revised 4/9/00 in the light of many helpful suggestions
   from Brian Goetz of Quiotix Corporation (brian@quiotix.com).

   Release 4

*/

/**
 * Stemmer, implementing the Porter Stemming Algorithm.
 *
 * <p>The Stemmer class transforms a word into its root form. The input
 * word can be provided a character at a time (by calling {@link #add(char)}),
 * or as a slice of a {@code char[]} (by calling {@link #add(char[], int)}).
 * After {@link #stem()} has run, the result is available through
 * {@link #toString()} or {@link #getResultBuffer()} /
 * {@link #getResultLength()}.
 *
 * <p>The word to be stemmed is expected to be in lower case: forcing lower
 * case must be done outside the Stemmer class.
 */
class Stemmer {

    /** Unit of size whereby {@code b} is increased. */
    private static final int INC = 50;

    /** Maximum number of letters of a single word that {@code main} keeps. */
    private static final int MAX_WORD_LENGTH = 500;

    /** Buffer holding the word being stemmed (and, afterwards, the stem). */
    private char[] b;

    /** Offset into {@code b}: number of characters added so far. */
    private int i;

    /** Offset to the end of the stemmed word. */
    private int i_end;

    /** General offset into {@code b}; set by {@code ends()} to the last index of the stem. */
    private int j;

    /** Index of the last character of the word currently held in {@code b}. */
    private int k;

    /** Creates an empty stemmer with an initial buffer of {@code INC} characters. */
    public Stemmer() {
        b = new char[INC];
        i = 0;
        i_end = 0;
    }

    /**
     * Add a character to the word being stemmed. When you are finished
     * adding characters, you can call {@link #stem()} to stem the word.
     *
     * @param ch the character to append
     */
    public void add(char ch) {
        if (i == b.length) {
            char[] grown = new char[i + INC];
            System.arraycopy(b, 0, grown, 0, i);
            b = grown;
        }
        b[i++] = ch;
    }

    /**
     * Adds {@code wLen} characters to the word being stemmed, taken from the
     * start of {@code w}. This is like repeated calls of {@link #add(char)},
     * but faster.
     *
     * @param w    array holding the characters to append
     * @param wLen number of characters of {@code w} to append
     */
    public void add(char[] w, int wLen) {
        if (i + wLen > b.length) {
            char[] grown = new char[i + wLen + INC];
            System.arraycopy(b, 0, grown, 0, i);
            b = grown;
        }
        System.arraycopy(w, 0, b, i, wLen);
        i += wLen;
    }

    /**
     * After a word has been stemmed, it can be retrieved by toString(),
     * or a reference to the internal buffer can be retrieved by
     * getResultBuffer and getResultLength (which is generally more efficient).
     *
     * @return the stemmed word
     */
    @Override
    public String toString() {
        return new String(b, 0, i_end);
    }

    /**
     * Returns the length of the word resulting from the stemming process.
     *
     * @return number of valid characters in the result buffer
     */
    public int getResultLength() {
        return i_end;
    }

    /**
     * Returns a reference to a character buffer containing the results of
     * the stemming process. You also need to consult getResultLength()
     * to determine the length of the result.
     *
     * @return the internal buffer (not a copy)
     */
    public char[] getResultBuffer() {
        return b;
    }

    /**
     * Stem the word placed into the Stemmer buffer through calls to add().
     * You can retrieve the result with getResultLength()/getResultBuffer()
     * or toString(). Words of one or two characters are left unchanged.
     * The input offset is reset afterwards, so the next add() starts a new word.
     */
    public void stem() {
        k = i - 1;
        if (k > 1) {
            step1();
            step2();
            step3();
            step4();
            step5();
            step6();
        }
        i_end = k + 1;
        i = 0;
    }

    /* step1() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet
    */
    private void step1() {
        if (b[k] == 's') {
            if (ends("sses")) {
                k -= 2;
            } else if (ends("ies")) {
                setto("i");
            } else if (b[k - 1] != 's') {
                k--;
            }
        }
        if (ends("eed")) {
            if (m() > 0) {
                k--;
            }
        } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
            k = j;
            if (ends("at")) {
                setto("ate");
            } else if (ends("bl")) {
                setto("ble");
            } else if (ends("iz")) {
                setto("ize");
            } else if (doublec(k)) {
                // Drop one letter of the double consonant, except for ll, ss, zz.
                k--;
                int ch = b[k];
                if (ch == 'l' || ch == 's' || ch == 'z') {
                    k++;
                }
            } else if (m() == 1 && cvc(k)) {
                setto("e");
            }
        }
    }

    /* step2() turns terminal y to i when there is another vowel in the stem. */
    private void step2() {
        if (ends("y") && vowelinstem()) {
            b[k] = 'i';
        }
    }

    /* step3() maps double suffices to single ones. so -ization ( = -ize plus
       -ation) maps to -ize etc. note that the string before the suffix must give
       m() > 0. */
    private void step3() {
        if (k == 0) {
            return; /* For Bug 1 */
        }
        switch (b[k - 1]) {
            case 'a':
                if (ends("ational")) { r("ate"); break; }
                if (ends("tional")) { r("tion"); break; }
                break;
            case 'c':
                if (ends("enci")) { r("ence"); break; }
                if (ends("anci")) { r("ance"); break; }
                break;
            case 'e':
                if (ends("izer")) { r("ize"); break; }
                break;
            case 'l':
                if (ends("bli")) { r("ble"); break; }
                if (ends("alli")) { r("al"); break; }
                if (ends("entli")) { r("ent"); break; }
                if (ends("eli")) { r("e"); break; }
                if (ends("ousli")) { r("ous"); break; }
                break;
            case 'o':
                if (ends("ization")) { r("ize"); break; }
                if (ends("ation")) { r("ate"); break; }
                if (ends("ator")) { r("ate"); break; }
                break;
            case 's':
                if (ends("alism")) { r("al"); break; }
                if (ends("iveness")) { r("ive"); break; }
                if (ends("fulness")) { r("ful"); break; }
                if (ends("ousness")) { r("ous"); break; }
                break;
            case 't':
                if (ends("aliti")) { r("al"); break; }
                if (ends("iviti")) { r("ive"); break; }
                if (ends("biliti")) { r("ble"); break; }
                break;
            case 'g':
                if (ends("logi")) { r("log"); break; }
                break;
            default:
                break;
        }
    }

    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
    private void step4() {
        switch (b[k]) {
            case 'e':
                if (ends("icate")) { r("ic"); break; }
                if (ends("ative")) { r(""); break; }
                if (ends("alize")) { r("al"); break; }
                break;
            case 'i':
                if (ends("iciti")) { r("ic"); break; }
                break;
            case 'l':
                if (ends("ical")) { r("ic"); break; }
                if (ends("ful")) { r(""); break; }
                break;
            case 's':
                if (ends("ness")) { r(""); break; }
                break;
            default:
                break;
        }
    }

    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
    private void step5() {
        if (k == 0) {
            return; /* for Bug 1 */
        }
        // Each case either breaks (suffix matched, j now marks the stem end)
        // or returns (nothing to strip).
        switch (b[k - 1]) {
            case 'a':
                if (ends("al")) { break; }
                return;
            case 'c':
                if (ends("ance")) { break; }
                if (ends("ence")) { break; }
                return;
            case 'e':
                if (ends("er")) { break; }
                return;
            case 'i':
                if (ends("ic")) { break; }
                return;
            case 'l':
                if (ends("able")) { break; }
                if (ends("ible")) { break; }
                return;
            case 'n':
                if (ends("ant")) { break; }
                if (ends("ement")) { break; }
                if (ends("ment")) { break; }
                /* element etc. not stripped before the m */
                if (ends("ent")) { break; }
                return;
            case 'o':
                /* j >= 0 fixes Bug 2 */
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) { break; }
                /* takes care of -ous */
                if (ends("ou")) { break; }
                return;
            case 's':
                if (ends("ism")) { break; }
                return;
            case 't':
                if (ends("ate")) { break; }
                if (ends("iti")) { break; }
                return;
            case 'u':
                if (ends("ous")) { break; }
                return;
            case 'v':
                if (ends("ive")) { break; }
                return;
            case 'z':
                if (ends("ize")) { break; }
                return;
            default:
                return;
        }
        if (m() > 1) {
            k = j;
        }
    }

    /* step6() removes a final -e if m() > 1. */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1)) {
                k--;
            }
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) {
            k--;
        }
    }

    /* ends(s) is true <=> 0,...k ends with the string s. On a match, j is set
       to the index just before the suffix; on a mismatch j is left untouched. */
    private boolean ends(String s) {
        int l = s.length();
        int o = k - l + 1;
        if (o < 0) {
            return false;
        }
        for (int c = 0; c < l; c++) {
            if (b[o + c] != s.charAt(c)) {
                return false;
            }
        }
        j = k - l;
        return true;
    }

    /* r(s) replaces the matched suffix with s, but only when m() > 0. */
    private void r(String s) {
        if (m() > 0) {
            setto(s);
        }
    }

    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
       k. */
    private void setto(String s) {
        int l = s.length();
        int o = j + 1;
        for (int c = 0; c < l; c++) {
            b[o + c] = s.charAt(c);
        }
        k = j + l;
    }

    /* m() measures the number of consonant sequences between 0 and j. if c is
       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
       presence,

          <c><v>       gives 0
          <c>vc<v>     gives 1
          <c>vcvc<v>   gives 2
          <c>vcvcvc<v> gives 3
          ....
    */
    private int m() {
        int n = 0;
        int pos = 0;
        // Skip the optional leading consonant sequence.
        while (true) {
            if (pos > j) {
                return n;
            }
            if (!cons(pos)) {
                break;
            }
            pos++;
        }
        pos++;
        while (true) {
            // Skip a vowel sequence.
            while (true) {
                if (pos > j) {
                    return n;
                }
                if (cons(pos)) {
                    break;
                }
                pos++;
            }
            pos++;
            n++;
            // Skip a consonant sequence.
            while (true) {
                if (pos > j) {
                    return n;
                }
                if (!cons(pos)) {
                    break;
                }
                pos++;
            }
            pos++;
        }
    }

    /* vowelinstem() is true <=> 0,...j contains a vowel */
    private boolean vowelinstem() {
        for (int c = 0; c <= j; c++) {
            if (!cons(c)) {
                return true;
            }
        }
        return false;
    }

    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
    private boolean doublec(int j) {
        if (j < 1) {
            return false;
        }
        if (b[j] != b[j - 1]) {
            return false;
        }
        return cons(j);
    }

    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
       and also if the second c is not w,x or y. this is used when trying to
       restore an e at the end of a short word. e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.
    */
    private boolean cvc(int i) {
        if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) {
            return false;
        }
        int ch = b[i];
        return !(ch == 'w' || ch == 'x' || ch == 'y');
    }

    /* cons(i) is true <=> b[i] is a consonant. */
    private boolean cons(int i) {
        switch (b[i]) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                return false;
            case 'y':
                // 'y' is a consonant at the start of a word or after a vowel.
                return (i == 0) ? true : !cons(i - 1);
            default:
                return true;
        }
    }

    /**
     * Test program for demonstrating the Stemmer. It reads text from a
     * list of files, stems each word, echoes the result to standard output
     * and writes it, preceded by an {@code id,abstract} header row, to
     * {@code stam1.csv} in the working directory.
     *
     * <p>Words are lower-cased before stemming. Processing stops at the first
     * file that cannot be found or read; whatever was collected up to that
     * point is still written to the output file.
     *
     * <p>Usage: Stemmer file-name file-name ...
     *
     * @param args paths of the input files
     * @throws IOException if the output file cannot be created
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the output file is flushed and closed.
        try (PrintWriter pw = new PrintWriter(new File("stam1.csv"))) {
            StringBuilder sb = new StringBuilder();
            // CSV header row
            sb.append("id");
            sb.append(',');
            sb.append("abstract");
            sb.append('\n');

            char[] w = new char[MAX_WORD_LENGTH + 1];
            Stemmer s = new Stemmer();

            for (int i = 0; i < args.length; i++) {
                try (FileInputStream in = new FileInputStream(args[i])) {
                    stemStream(in, s, w, sb);
                } catch (FileNotFoundException e) {
                    System.out.println("file " + args[i] + " not found");
                    break;
                } catch (IOException e) {
                    System.out.println("error reading " + args[i]);
                    break;
                }
            }

            pw.write(sb.toString());
        }
    }

    /**
     * Copies {@code in} to standard output and to {@code sb}, replacing every
     * run of letters by its lower-cased stem; all other characters are copied
     * through unchanged.
     *
     * <p>NOTE: the stream is read one byte at a time and each byte is treated
     * as one character, so multi-byte encodings are not decoded.
     *
     * @param in stream to read until end of file
     * @param s  stemmer reused for every word
     * @param w  scratch buffer; at most {@code w.length - 1} letters of a word are kept
     * @param sb receives the stemmed text
     * @throws IOException if reading from {@code in} fails
     */
    private static void stemStream(FileInputStream in, Stemmer s, char[] w, StringBuilder sb)
            throws IOException {
        final int maxLen = w.length - 1;
        while (true) {
            int ch = in.read();
            if (Character.isLetter((char) ch)) {
                int len = 0;
                while (true) {
                    ch = Character.toLowerCase((char) ch);
                    w[len] = (char) ch;
                    // Over-long words: keep overwriting the last slot instead of overflowing.
                    if (len < maxLen) {
                        len++;
                    }
                    ch = in.read();
                    if (!Character.isLetter((char) ch)) {
                        s.add(w, len);
                        s.stem();
                        String u = s.toString();
                        System.out.print(u);
                        sb.append(u);
                        break;
                    }
                }
            }
            if (ch < 0) {
                break; // end of file
            }
            System.out.print((char) ch);
            sb.append((char) ch);
        }
    }
}

/**
 * Unfinished placeholder that declares the same state as {@code Stemmer}
 * (buffer, offsets and growth increment) but has no behavior yet; the only
 * reference to it in this file is a commented-out call.
 */
class StepforStem
{

    /** Character buffer; never allocated by this class. */
    private char[] b;

    private int i, /* offset into b */

    i_end, /* offset to end of stemmed word */

    j, k;

    /** Unit of size for buffer growth; unused here. */
    private static final int INC = 50;

    /** Creates an empty instance; no fields are initialised. */
    public StepforStem()
    {

    }

}

저작자표시 비영리 변경금지 (새창열림)

'개발 > Big data' 카테고리의 다른 글

래피드마이너 설치 (0)	2017.07.27
stemming postagging 합친것 정리 (0)	2016.05.29
자바 소문자로 변환 (0)	2016.05.29
opennlp를 사용하여 csv파일을 가져와 명사 추출 하고 csv파일로 내보내기( extract noun from csv file with openlp) (0)	2016.05.01

개발주발

포스테깅 한 csv파일을 스태밍 한 후 csv파일로 내보내기(make stemming csv file from csv file)

'개발 > Big data' 카테고리의 다른 글

티스토리툴바

포스테깅 한 csv파일을 스태밍 한 후 csv파일로 내보내기(make stemming csv file from csv file)

'개발 > Big data' 카테고리의 다른 글

'개발/Big data' Related Articles

티스토리툴바