import java.io.*;
import java.util.*;
import java.util.regex.*;

/**
 * Command-line tool that compares two tagged text files line by line and
 * reports every word whose tag differs between them.
 *
 * <p>Usage: {@code java ProcessTags posfile taggedfile}. The first file is
 * treated as the correct tagging and the second as the output being checked.
 * Each line is split on the characters {@code "/ [_]<>="} and the resulting
 * tokens are read as alternating word / tag pairs. All findings are written
 * to standard output.
 */
public class ProcessTags {

    /**
     * Matches every run of characters other than ASCII letters, digits,
     * {@code _}, {@code .} and {@code ,}. A line that is empty once these
     * runs are removed is treated as carrying no content and is skipped.
     */
    private static final Pattern NON_CONTENT = Pattern.compile("[^0-9A-Za-z_.,]+");

    /**
     * Pattern for markup that is stripped from each line before tokenizing.
     * NOTE(review): the pattern is empty in the source as received, so the
     * replacement is a no-op; presumably the real expression was lost at some
     * point. Restore it here if it is known.
     */
    private static final Pattern TAG_MARKUP = Pattern.compile("");

    /** Delimiter characters handed to {@link StringTokenizer} for both files. */
    private static final String TOKEN_DELIMITERS = "/ [_]<>=";

    /**
     * Entry point. Prints a usage message unless exactly two arguments are
     * given; otherwise compares the two files and prints the differences.
     * Any exception raised while processing is printed as a stack trace.
     *
     * @param args {@code args[0]} is the reference POS file, {@code args[1]}
     *             the tagged file to check
     */
    public static void main(String[] args) {
        BufferedReader posFile = null;
        BufferedReader taggedFile = null;
        try {
            if (args.length != 2) { // print help
                System.out.println("Usage is:");
                System.out.println("java ProcessTags posfile taggedfile");
                return;
            }
            posFile = new BufferedReader(new FileReader(args[0]));
            taggedFile = new BufferedReader(new FileReader(args[1]));
            compareFiles(posFile, taggedFile);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release both file handles on every path, including failures.
            closeQuietly(posFile);
            closeQuietly(taggedFile);
        }
        System.out.println("");
    }

    /**
     * Walks both readers in lock step, pairing up their content lines and
     * reporting tag mismatches for each pair.
     *
     * <p>The line number used in reports counts compared line pairs starting
     * at 1; skipped no-content lines do not advance it, so it is not
     * necessarily a physical line number in either file.
     *
     * @param posFile    reader over the reference tagging
     * @param taggedFile reader over the tagging being checked
     * @throws IOException if reading either file fails
     */
    private static void compareFiles(BufferedReader posFile, BufferedReader taggedFile)
            throws IOException {
        int lineNumber = 1;
        String posLine = posFile.readLine();
        String taggedLine = taggedFile.readLine();
        System.out.println("");

        while (true) {
            // Advance each side over whitespace-only or otherwise empty lines.
            taggedLine = skipToContent(taggedFile, taggedLine);
            posLine = skipToContent(posFile, posLine);

            // Stop when either side is exhausted; report it if only one is.
            if (posLine == null && taggedLine == null) {
                break;
            }
            if (posLine == null) {
                System.out.println("Premature end of POS");
                break;
            }
            if (taggedLine == null) {
                System.out.println("Premature end of tagged input");
                break;
            }

            // Remove the tags -- they should never be on a line by themselves.
            taggedLine = TAG_MARKUP.matcher(taggedLine).replaceAll("");
            posLine = TAG_MARKUP.matcher(posLine).replaceAll("");

            reportMismatches(posLine, taggedLine, lineNumber);

            posLine = posFile.readLine();
            taggedLine = taggedFile.readLine();
            lineNumber = lineNumber + 1;
        }
    }

    /**
     * Returns the first line, starting with {@code line} itself, that still
     * has characters left after {@link #NON_CONTENT} runs are removed.
     *
     * @param reader source of further lines
     * @param line   the current line, or {@code null} if the reader is
     *               already exhausted
     * @return the first content line, or {@code null} at end of input
     * @throws IOException if reading fails
     */
    private static String skipToContent(BufferedReader reader, String line)
            throws IOException {
        // The null test must come before the matcher is built: creating a
        // matcher over null input throws NullPointerException.
        while (line != null && NON_CONTENT.matcher(line).replaceAll("").length() == 0) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Tokenizes one pair of lines and prints a report for every position
     * where the words agree but the tags differ.
     *
     * @param posLine    line from the reference file
     * @param taggedLine line from the file being checked
     * @param lineNumber number printed in the report
     */
    private static void reportMismatches(String posLine, String taggedLine, int lineNumber) {
        StringTokenizer posTokens = new StringTokenizer(posLine, TOKEN_DELIMITERS);
        StringTokenizer taggedTokens = new StringTokenizer(taggedLine, TOKEN_DELIMITERS);

        while (taggedTokens.hasMoreTokens() && posTokens.hasMoreTokens()) {
            String taggedWord = taggedTokens.nextToken();
            String posWord = posTokens.nextToken();

            // Step over a marker token that appears only on the tagged side.
            if (isMarker(taggedWord) && taggedTokens.hasMoreTokens()) {
                taggedWord = taggedTokens.nextToken();
            }
            // A word without a following tag on either side ends the line.
            if (!taggedTokens.hasMoreTokens() || !posTokens.hasMoreTokens()) {
                break;
            }

            String taggedTag = taggedTokens.nextToken();
            String posTag = posTokens.nextToken();

            if (!taggedTag.equals(posTag) && taggedWord.equals(posWord)) {
                System.out.println("\n\n");
                System.out.println("On line " + lineNumber + " the tag for the word "
                        + taggedWord + " was " + taggedTag + " instead of " + posTag + "\n");
                System.out.println("The correct sentence tagging is: " + posLine + "\n");
                System.out.println("The output sentence tagging was: " + taggedLine + "\n");
            }
        }
    }

    /**
     * Tells whether a token from the tagged file is one of the markers that
     * is skipped rather than compared.
     * NOTE(review): presumably these are sentence/bracket markers emitted by
     * the tagger -- confirm against the tagger's output format.
     */
    private static boolean isMarker(String token) {
        return token.equals("S") || token.equals("((") || token.equals("))");
    }

    /** Closes the reader if it was opened; a failure to close is ignored. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close of a read-only file.
        }
    }
}