\documentclass[twoside,11pt]{article}
\usepackage{jmlr2e}
\usepackage{graphicx}
\usepackage{float}
\input{psfig.sty}

\jmlrheading{2}{2002}{639--668}{9/01}{3/02}{Be\'{a}ta Megyesi}
\ShortHeadings{Shallow Parsing with PoS Taggers and Linguistic Features}{Megyesi}
\firstpageno{639}

\newcommand{\comment}[1]{}
\newcommand{\uncomment}[1]{#1}

\title{Shallow Parsing with PoS Taggers and Linguistic Features}

\author{Be\'{a}ta Megyesi \email bea@speech.kth.se\\
\addr Centre for Speech Technology (CTT)\\
	Department of Speech, Music and Hearing\\
	KTH, Sweden\\
	Drottning Kristinas v\"{a}g 31\\
	SE--100 44, Stockholm, Sweden\\
}

\begin{document}

\editor{James Hammerton, Miles Osborne, Susan Armstrong and Walter Daelemans}

\maketitle
\begin{abstract}
Three data-driven publicly available part-of-speech taggers are applied to shallow parsing of Swedish texts. The phrase structure is represented by nine types of phrases in a hierarchical structure containing labels for every constituent type the token belongs to in the parse tree. The encoding is based on the concatenation of the phrase tags on the path from lowest to higher nodes. Various linguistic features are used in learning; the taggers are trained on the basis of lexical information only, part-of-speech only, and a combination of both, to predict the phrase structure of the tokens with or without part-of-speech. Special attention is directed to the taggers' sensitivity to different types of linguistic information included in learning, as well as the taggers' sensitivity to the size and the various types of training data sets. The method can be easily transferred to other languages. 

\end{abstract}

\begin{keywords}
  Chunking, Shallow parsing, Part-of-speech taggers, Hidden Markov models, Maximum entropy learning, Transformation-based learning
\end{keywords}

\section{Introduction}

Machine learning techniques in the last decade have permeated several areas of natural language processing (NLP). The reason is that a vast number of machine learning algorithms have proved to be able to learn from natural language data given a relatively small correctly annotated corpus. 
Therefore, machine learning algorithms make it possible to develop, within a short period of time, language resources---data analyzed on various linguistic levels---that are necessary for numerous applications in natural language processing.

One of the most popular NLP areas that machine learning algorithms have been successfully applied to is part-of-speech (PoS) tagging, i.e.\ the annotation of words with the contextually appropriate PoS tags, often including morphological features. The data-driven algorithms that have been successfully applied to this task for several languages include, among others, hidden Markov modeling \citep{Brants00}, inductive logic programming (\citeauthor{Cussens98}, \citeyear{Cussens98}; \citeauthor{Eineborg00}, \citeyear{Eineborg00}), maximum entropy learning \citep{Ratna96}, memory-based learning (\citeauthor{Dael96}, \citeyear{Dael96}; \citeauthor{Zav99}, \citeyear{Zav99}), and transformation-based learning \citep{Brill94}. The main advantage with data-driven PoS taggers is that they are language and tag set independent and thereby easily applicable to new languages and domains. The average accuracy that is reported for state-of-the-art data-driven PoS taggers lies between 95\% and 98\% depending on the language type the taggers are trained and tested on. 

In recent years, some attempts have also been made to build data-driven shallow parsers. The main goal of the data-driven parsers is, above all, to find the phrase structure of the sentence and not, as one might think, to disambiguate words according to their context. The disambiguation is already taken care of by the PoS taggers which use some kind of background knowledge, i.e.\ parameters that tell the system to check the contextual environment of the current word and/or tag.

As a first step in building corpus-based parsers, a considerable amount of research has been carried out to find syntactically related non-overlapping groups of words, so-called ``chunks'' \citep{Abney91}. A chunk is a major phrase category consisting of the phrasal head and its modifiers on the left hand side. The example below, borrowed from \citet{Tjong00}, illustrates three different chunk types ({\sc np}, {\sc vp} and {\sc pp}) for the sentence ``He reckons the current account deficit will narrow to only \pounds 1.8 billion in September'' shown in bracketing structure.

\bigskip
\indent\indent[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] \\
\indent\indent[PP to ] [NP only \pounds 1.8 billion ] [PP in ] [NP September ].
\bigskip

Within the area of data-driven chunking, much attention has been directed to the development of recognition methods for simple, non-recursive noun phrases, also called base {\sc np} chunks (e.g.\ \citeauthor{Church88}, \citeyear{Church88}; \citeauthor{Cardie98}, \citeyear{Cardie98}; \citeauthor{Brants98}, \citeyear{Brants98}). These phrases play an important role in many application areas, such as information extraction and information retrieval, as well as in human language processing \citep{Gee83}. Research on the detection of other chunk types, such as prepositional phrases ({\sc pp}), adverb phrases ({\sc advp}), adjective phrases ({\sc adjp}) and verb clusters, by data-driven methods has also been carried out with promising results (see \citeauthor{Ramshaw95}, \citeyear{Ramshaw95}; \citeauthor{Arg98}, \citeyear{Arg98}; \citeauthor{Brants99}, \citeyear{Brants99}; \citeauthor{Buch99}, \citeyear{Buch99}; \citeauthor{Veen99}, \citeyear{Veen99}; \citeauthor{Osborne00}, \citeyear{Osborne00}; and \citeauthor{Megyesi011}, \citeyear{Megyesi011}). However, most of these chunkers only recognize a phrase up to its head word without finding the arguments on the right side of the head. For example, in the example above, the two {\sc pp}s do not include their {\sc np} arguments. Additionally, in almost all these studies with the exception of work by \citet{Brants99}, the internal phrase structure of the chunk is not analyzed. As we can see in the example sentence, the phrases inside the {\sc np} are not marked.   
Also, different studies use various linguistic information to find the chunks; some use PoS only without taking any lexical information into consideration, while some combine the words and their PoS in learning.

It is also worth mentioning that the majority of studies on chunking have been focused on the development of data-driven chunkers/parsers for English, just as was the case with the part-of-speech tagging task a couple of years ago. The reason is mainly that there is a correctly parsed corpus for English, the Penn Treebank \citep{Marcus94}, while such a corpus is missing for most languages. Given this correctly parsed large data set, the development and evaluation of data-driven approaches become easier and more reliable.

The motivating purpose of this work is to build a data-driven shallow parser without a great deal of human effort for Swedish, describing the whole constituent structure the word belongs to in a hierarchical fashion. Desirable properties of the shallow parser are as follows:

\begin{itemize}
\item easily trainable, fast and robust
\item corpus-based, i.e.\ data-driven, so that it can be applicable to various domains
\item having a hierarchical phrase representation so that it is capable of being used for many different applications
\end{itemize}

The fact that many data-driven PoS taggers are language and tag set independent, and the fact that these taggers have some implemented linguistic knowledge about the contextual environment of words and/or tags, lead to the thought that these PoS taggers can be assumed to be useful to parse texts, given some correctly chunked/parsed data, i.e.\ a treebank. Inspired by the success of the maximum entropy based data-driven PoS tagger, {\sc mxpost} \citep{Ratna96}, applied directly to chunk English \citep{Osborne00}, we will use three different data-driven PoS taggers as a basis for parsing Swedish texts. The PoS taggers are implementations of three algorithms: hidden Markov modeling \citep{Brants00}, maximum entropy learning \citep{Ratna96}, and transformation-based learning \citep{Brill94}. 

The aim of this study is, in particular, to find out what combinations of linguistic information are the most appropriate for the parsing task so that the taggers can efficiently learn to parse texts, and to find out what effects different kinds of linguistic information included in the training data have on the parsers in this processing. In addition, the taggers' sensitivity to the size of the training set is investigated, and an evaluation for real-world applications is carried out.

The remainder of the paper is organized as follows: Section 2 gives an overview of previous studies performed on data-driven chunking; Section 3 presents the phrase structure representation, the training data and benchmark, as well as a brief description of the taggers that the parsers are built on; Section 4 describes the experiments on various linguistic features used in learning; Section 5 presents the experiments and the results; and finally, Section 6 concludes the paper and gives directions for further research.   

\section{Previous Work on Data-Driven Text Chunking}
\label{pw}
The concept of the chunk was introduced by \citet{Abney91}. He defines a chunk as ``a single content word surrounded by a constellation of function words, matching a fixed template''. He proposed that by dividing a sentence into meaningful, correlated sequences of words---chunks---and combining those into trees, we can build a parser which has psycholinguistic evidence in that it represents structures corresponding to pauses and intonation changes in speech. Abney's chunk parser consists of two steps; first the chunker finds potential chunks on the basis of PoS information, and then an attacher finds the correct chunk by resolving structural ambiguities on the basis of lexical information.         

Abney's pioneering work has influenced a lot of researchers. Several studies have been performed to develop data-driven chunkers as a first step to build parsers. One of the earliest studies on this topic was presented by \citet{Ramshaw95}. They used transformation-based learning \citep{Brill94} to locate chunks in texts by treating chunking as a tagging problem. The chunk structure was represented as tags attached to words, in a similar way as is done in data-driven PoS tagging. They performed experiments using two different chunk structure targets. The first target was to identify non-overlapping, non-recursive noun phrases, so called base {\sc np}s, as far as the nominal head, including determiners and adjectives, but not prepositional phrases or other types of arguments located after the head word. The tag set consisted of three types of tags: B for the first word of the base {\sc np}, I for the words inside the base {\sc np}, and O for the words outside of the chunk. The second target of their work was to partition sentences into non-overlapping noun-type (N) and verb-type (V) chunks in a similar fashion as was proposed by \citet{Abney91}. The noun-type chunks consisted of, among others, noun phrases as far as the nominal head, prepositional phrases including an {\sc np} argument, but not coordinating conjoined {\sc np}s. Each N and V type had two tags, depending on whether the word was initially positioned in the type or not, and an extra tag was reserved for punctuation marks. They used the parsed Wall Street Journal texts from Penn Treebank \citep{Marcus94} to automatically derive the chunk structure. They extended the templates of Brill's PoS tagger to include references up to two chunk tags, as well as to up to three words and/or their PoS tags. The result showed a precision of 93.1\% and a recall of 93.5\% for base {\sc np} chunks when trained on 950k words and tested on 50k words using lexical and PoS information. 
When lexical information was excluded, precision and recall decreased to 90.5\% and 90.7\% respectively. For the N and V partitioning, precision and recall rates are reported to be 88\% when training was performed on 200k words. Also, they pointed out that the size of the training set has a significant effect on the results.  

\citet{Arg98} used memory-based sequence learning to recognize {\sc np} and {\sc vp} chunks in PoS tagged texts. The same data set was used as in the study by \citet{Ramshaw95} but the learner was trained on PoS tag sequences containing bracketed chunk boundaries without including lexical information. They report precision and recall rates of 91.6\%.  
    

Other experiments on data-driven chunking were also performed with memory-based learning methods. \citet{Cardie98} presented a corpus-based approach for finding base {\sc np}s by using PoS tag sequences without any lexical information. They created grammar rules from the training data and improved the grammar by pruning it on another data set, using local repair heuristics that improved the precision without decreasing the recall. A further step of discarding the ten worst rules was also carried out without decreasing the precision. They achieved 94\% precision and recall on simple base {\sc np}s, and 91\% on more complex ones.  

\citet{Veen99}, also using a memory-based learning technique---{\sc igt}ree---\citep{Dael96}, described experiments on {\sc np}, {\sc vp} and {\sc pp} chunking using the Wall Street Journal for data and the {\sc bio} labels attached to each chunk type as proposed by \citet{Ramshaw95} and described above. He reported precision and recall rates between 94\% and 95\%, and an accuracy of 98\% and 99\% for {\sc np} and {\sc vp} chunks respectively.

\citet{Buch99} used memory-based learning to assign grammatical relations (for example subject, object, etc.) to texts by first finding {\sc np}, {\sc vp}, {\sc pp}, {\sc adjp} and {\sc advp} chunks, and then using pairs of chunks to predict grammatical relations. The data-driven chunker was in turn applied in several steps. First, prepositions, {\sc np}, {\sc vp}, {\sc adjp} and {\sc advp} chunks were found simultaneously, then prepositions and {\sc np}s were collapsed into {\sc pp}s. They reported an F$_{\beta=1}$ score of 92.3\% for {\sc np}s, 91.8\% for {\sc vp}s, 66.7\% for {\sc adjp} chunks, 77.9\% for {\sc advp} chunks, and 96.1\% for prepositions. For {\sc pp} chunks, the F$_{\beta=1}$ score was 92\%.

\citet{Brants99} presented a method for partial parsing that uses cascades of Markov Models to generate structural elements in a layer-by-layer fashion. The algorithm generates the internal structure of {\sc np} and {\sc pp} chunks including {\sc ap}s and {\sc advp}s, and other pre-modifiers. Sequences of words divided sentence by sentence served as input and the output was the PoS and chunked text. The algorithm was tested on 300k words taken from the {\sc negra} corpus consisting of German newspaper texts. Recall was 54\% for 1 layer and 84.8\% for 9 layers; precision was 91.4\% for 1 layer and 88.3\% for 9 layers. As Brants points out, these results are not directly comparable to previous studies because his study was performed on a different language than English (namely German) and his algorithm labeled the internal phrases within the {\sc np} and {\sc pp} chunks.
      
\citet{Osborne00} used a maximum entropy-based PoS tagger, {\sc mxpost} \citep{Ratna96}, without modifying the PoS tagger's internal operation, thus treating chunking as part-of-speech tagging, with an accuracy of 94.88\% and an overall F$ _{\beta=1}$ score of 91.94\%. The study was a part of a competition for the chunking approach at the 4th Conference on Computational Natural Language Learning (CoNLL-2000) which supplied the tag set, including the training and test data taken from the Wall Street Journal corpus. The training data consisted of 211,727 tokens and the test data of 47,377 tokens. 
The types of chunks used in the competition are described by \citet{Tjong00} and include ``base phrase categories'': noun phrases {\sc (np)} to the nominal head, verb clusters {\sc (vp)}, adjective phrases {\sc (adjp)}, adverb phrases {\sc (advp)}, prepositions {\sc (pp)} without {\sc np}s, compound conjunctions, verbal particles, interjections, list markers and conjunctions. 

The goal of the studies presented above was mainly to identify base phrase categories. Next, we will describe our method to build data-driven shallow parsers representing general phrasing including, among others, whole noun phrases with right-side arguments. 


\section{Building Shallow Parsers}
\label{building}
Four different aspects need to be addressed in order to build a data-driven shallow parser: the choice and the representation of the target classes that the algorithms have to learn to predict, the data used for training and testing, the choice of algorithm(s), and the attributes or features included in learning. In the following sections, these aspects will be described.

\subsection{Phrase Structure Representation}
\label{ph}
As we have seen in Section \ref{pw}, in previous studies (with the exception of the work presented by \citeauthor{Brants99}, \citeyear{Brants99}), the internal structure of the chunks is not analyzed. Only categories on higher nodes of the constituent structure are represented. For example, if a token/word belongs to an adjective phrase which in turn belongs to a noun phrase, the token is labeled with the noun phrase constituent only, not marking any other lower nodes in the tree. Leaving out the lowest constituents the token belongs to can have drawbacks for several applications, for example in dialog systems or text-to-speech systems, where information about the whole constituent structure can be important for better system performance. Therefore, the representation of the whole phrasal hierarchy containing information on all phrases is desirable.

Additionally, previous studies represent only partially linguistically motivated phrasal categories. Some phrase structures are not fully represented. For example, noun phrases are marked as far as the head noun only, hence the arguments on the right side of the noun head are missing. Also, prepositional phrases in many studies do not include any noun phrase. Furthermore, some PoS categories are treated as phrases, as in the CoNLL-2000 competition on chunking, where conjunctions constitute a conjunction phrase and interjections an interjection phrase. 

To be able to represent the whole hierarchical phrase structure, nine types of phrases are used. Some categories correspond to the chunks used in previous studies, for example {\sc ap}, {\sc advp}, and verb clusters. Other categories are designed to be able to handle arguments on the right hand side of the phrasal head and represent maximal projections, such as the maximal noun phrase label. Some categories are included to handle co-ordinated phrases, such as the maximal adjective phrase label. The phrase categories are listed below, each followed by a brief explanation and an example.

\begin{itemize}
\item Adverb Phrase {\sc (advp)} consists of adverbs that can modify adjectives or numerical expressions. 
\\ e.g.\ very 
\item Minimal Adjective Phrase {\sc (ap)} constitutes the adjectival head and its possible modifiers, e.g.\ {\sc advp} and/or prepositional phrase.\\e.g.\ very interesting
\item Maximal Adjective Phrase {\sc (apmax)} includes more than one {\sc ap} with a delimiter or a conjunction in between.\\e.g.\ very interesting and nice
\item Numerical Expression {\sc (nump)} consists of numerals with their possible modifiers, for example {\sc ap} or {\sc advp}.\\e.g.\ several thousands 
\item Noun Phrase {\sc (np)} may include the head noun and its modifiers to the left, e.g.\ determiners, nouns in genitive, possessive pronouns, numerical expressions, {\sc ap}, {\sc apmax} and/or compound nouns. Thus, possessive expressions do not split an {\sc np} into two noun phrases as in the CoNLL-2000 shared task on chunking.\\e.g.\ Pilger's very interesting and nice book  
\item Maximal Projection of {\sc np} {\sc (npmax)} includes one or more {\sc np}(s) with following {\sc pp}(s) as possible modifier.\\e.g.\ Pilger's very interesting and nice book about politics 
\item Prepositional Phrase {\sc (pp)} consists of one or several prepositions delimited by a conjunction and one or several {\sc np}s/{\sc npmax}s, or in elliptical expressions an {\sc ap} only.\\e.g.\ about politics   
\item Verb Cluster {\sc (vc)} consists of a continuous verb group belonging to the same verb phrase without any intervening constituents like {\sc np} or {\sc advp}.\\e.g.\ would have been
\item Infinitive Phrase {\sc (infp)} includes an infinitive verb together with the infinitive particle and may contain {\sc advp} and/or verbal particles.\\e.g.\ to go out
\end{itemize}

Note that the grammatical categories represent neither clauses, such as relative clauses, nor sentences. These structures are planned to be analyzed in a later stage. 

\subsection{Training Data and Benchmark}
\label{data}
Swedish belongs to the Scandinavian, North Germanic family of the Germanic branch of Indo-European languages. It is morphologically richer than, for example, English. Nouns in general have a two-gender distinction. The genders are marked mainly by articles, adjectives, anaphoric pronouns, and in plural endings. As in English, nouns can appear with or without articles. There are, however, definite and indefinite articles that agree with the head noun in gender, number and definiteness. Furthermore, adjectives have gender, definiteness and plurality markers. Thus, in a noun phrase, both articles and adjectives agree in number, gender and definiteness with the head noun. Also, compound nouns are frequent and productive. Verbs lack markers for person or number of the subject but retain tense including complex tense forms. From a syntactic point of view, Swedish has subject-verb-object order in independent declarative sentences, as well as in subordinate clauses, similar to English. However, in subordinate clauses the sentence adverbs normally precede the finite verb and the perfect auxiliary can be omitted.   

Unfortunately, correctly chunked/parsed texts are not available for Swedish. Therefore, a treebank was built to serve as training data and a benchmark corpus. For the treebank development, an Earley parser, {\sc spark} \citep{Aycock98}, was used together with a context-free grammar for Swedish developed by the author.

The second version of the Stockholm-Ume{\aa} corpus \citep{Eje92} annotated with {\sc parole} tags served as input to the parser.\footnote{Thanks to Britt Hartmann at the Department of Linguistics, Stockholm University, Sweden for making the second version of the Stockholm-Ume{\aa} corpus with {\sc parole} tags available.} The corpus is balanced, consisting of over one million PoS tagged tokens taken from different text genres in Swedish. The tag set consists of 146 tags including PoS categories and morphological features. 
The PoS tagged texts were parsed by {\sc spark} using the nine phrase categories that were described in Section~\ref{ph}.

Each phrase type is represented with an additional tag marking position information in a manner similar to that proposed by \citet{Ramshaw95} and used in the CoNLL-2000 competition:
\bigskip\\
\indent XB -- the initial word of the phrase X \\
\indent XI -- non-initial word inside the phrase X \\
\indent O -- word outside of any phrase. 
\bigskip \\
Thus, each word and punctuation mark in a sentence is accompanied by a tag which indicates the phrase structure the token belongs to in the parse tree together with the position information. Since a token may belong to several phrases, it can have several tags. 

The representation is illustrated in the example below for the Swedish equivalent of the sentence ``Everybody should read Pilger's very good books about politics'' represented first by parenthesis notation, and second by PoS and phrase tags.
\newpage
{\small \indent [NP Alla NP] [VC borde l\"{a}sa VC] [NPMAX [NP Pilgers [AP [ADVP mycket ADVP] bra AP] \\ \indent b\"{o}cker NP] [PP om [NP politik NP] PP] NPMAX].}\\

\setlength{\tabcolsep}{5pt}
\begin{table}[h]
\begin{center}
\begin{tabular}{llll}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{Word} & {PoS + morphology} & {Phrase tags} & Translation\\
& as {\sc parole} tags & &\\
\noalign{\smallskip}
\hline 
\noalign{\smallskip}
Alla & PI@0P0@S & NPB & {\small{(everybody)}} \\
borde & V@IIAS & VCB & {\small{(should)}}\\
l\"{a}sa & V@N0AS & VCI	& {\small{(read)}} \\
Pilgers & NP00G@0S & NPB\_NPMAXB & {\small{(Pilger's)}}\\
mycket & RGPS & ADVPB\_APB\_NPI\_NPMAXI & {\small{(very)}}\\
bra & AQP00N0S & API\_NPI\_NPMAXI & {\small{(good)}}\\
b\"{o}cker & NCUPN@IS & NPI\_NPMAXI & {\small{(books)}}\\
om & SPS & PPB\_NPMAXI & {\small{(about)}}\\ 
politik & NCUSN@IS & NPB\_PPI\_NPMAXI & {\small{(politics)}}\\ 
. & FE & O\\
\noalign{\smallskip}
\hline
\end{tabular}
\end{center}
\end{table}
The label for a word forms a hierarchical grouping of the parts of the sentence into constituents where lower nodes are situated nearest the word and higher nodes are farthest out. The advantage of the hierarchical annotation on the phrase level is that the user can choose the level of analysis by skipping phrase categories on lower, or higher nodes. For example, the user may only want to use noun phrase extraction without any information on the constituents inside the noun phrase, or to get a full analysis of every large phrase in the sentence. This type of annotation can be used in many different applications. The question is how well the data-driven PoS taggers can learn the hierarchical phrasal structure. 

The parsed text, annotated with the hierarchical constituent structure, serves as training data and benchmark corpus for the experiments. {\sc spark} introduced some errors in both the training data and the benchmark. The error rate is estimated to be between 6\% and 11\% with 98\% confidence, and was determined by calculating the errors on a sample of 2,450 tokens in the training and test sets respectively. 60\% of the errors are due to the {\sc pp} attachment problem in maximal projections of {\sc np}s, which is considered to be difficult even for human annotators. About 25\% of the noise is due to wrong position information of the {\sc np}. The rest of the errors can be found mainly in connection with adjective phrases. As manual post-processing to eliminate the noise was found to be prohibitively time-consuming, these errors have not been corrected.

After this description of the representation of the data, a brief overview of the algorithms, each with implementations for the PoS tagging approach that the parsers are built on, follows.

\subsection{Algorithms and Implementations}
\label{algorithms}
 The shallow parsers are based on three state-of-the-art data-driven algorithms that have implementations for the PoS tagging approach. Common to these taggers are their language and tag set independence, their free availability for research and their successful usage for several languages. The taggers that will be used to parse Swedish in this study are:  {\sc fntbl} \citep{Ngai01} which is a fast version of Brill's tagger based on transformation-based learning \citep{Brill94}, {\sc mxpost}, based on the maximum entropy framework \citep{Ratna96}, and lastly, Trigrams'n'Tags ({\sc tnt}) based on a Hidden Markov Model \citep{Brants00}. 
 
{\sc fntbl}, developed by \citet{Ngai01}, is a fast version of Brill's transformation-based learning algorithm.\footnote{The difference between Brill's original and Ngai \& Florian's implementation ({\sc fntbl}) is that the latter stores the rules in memory instead of regenerating the rules at each step of the learning process, and the rules are only generated for the examples that change. A detailed description can be found in \citet{Ngai01}.}
It is a rule-based approach that learns by detecting errors. It begins with an unannotated text that is labeled by an initial-state annotator in a heuristic fashion. Known words (according to some lexicon) are annotated with their most frequent tag while unknown words receive an initial tag (for example the most frequently occurring tag in the corpus). Then, an ordered list of rules learned during training is applied deterministically to change the tags of the words according to their contexts. Unknown words are first assumed to be nouns and handled by prefix and suffix analysis by looking at the first/last one to four letters, capitalization feature and adjacent word co-occurrence. For the disambiguation of known words, the system uses a context of up to three preceding and following words and/or tags of the focus word as default. 

{\sc mxpost}, developed by \citet{Ratna96}, is a probabilistic classification-based approach based on a maximum entropy model where contextual information is represented as binary features that are used simultaneously in order to predict the PoS tags. The binary features used by {\sc mxpost} as default include the current word, the following and preceding two words and the preceding one or two tags.  For rare and unknown words the first and last four characters are included in the features, as well as information about whether the word contains uppercase characters, hyphens or numbers. The tagger uses a beam search in order to find the most probable sequence of tags. The tag sequence with the highest probability is chosen.
 
Trigrams'n'Tags ({\sc tnt}), developed by \citet{Brants00}, is a statistical approach, based on a hidden Markov model that uses the Viterbi algorithm with beam search for fast processing. The states represent tags and the transition probabilities depend on pairs of tags. The system uses maximum likelihood probabilities derived from the relative frequencies. The main smoothing technique implemented as default is linear interpolation. The system uses a context of three tags. Unknown words are handled by suffix analysis up to the last ten characters of the word. Additionally, information about capitalization is included as default.

For the experiments, all systems are used with the default settings according to their documentation and are trained on the Swedish data described in Section~\ref{data}.

\section{Experiments on Various Linguistic Features in Learning}
\label{features}
In previous studies on chunking, different types of linguistic information were used in training in order to find the correct chunk structure of the sentence. \citet{Ramshaw95} used lexical and/or PoS information, \citet{Arg98} and \citet{Cardie98} induced learning on the basis of PoS sequences without including any lexical information, while \citet{Brants99} entirely relied on words to be able to recognize both the PoS tags and the chunks.
Comparing the results of these studies, we can see that the average accuracy is reported to be lowest when training is performed on PoS sequences only. 
However, it is difficult to compare the results because the learning algorithm, the data set, or the language varies across the studies. Therefore, it is of particular interest to train, test and compare the taggers on different types of data sets containing various linguistic features, using the same training and test set for reliable evaluation.

In order to ascertain how well different data-driven PoS taggers can learn the whole hierarchical constituent structure of the word sequences, and to examine what effect different kinds of linguistic information included in the training data have on the taggers, four experiments are carried out. 
Each tagger is trained on four types of data set, each including different types of linguistic information, as is shown in Table~\ref{ling}. First, the training is performed on the basis of the word only---lexical information---to predict the PoS tag and the phrase tags. Second, similarly to the first case, training is performed on the word sequences to predict the phrase tags without PoS information. Third, the training is based on the word together with its PoS to predict the phrase labels. Lastly, the words are removed from the training data, and only the PoS tags of the words are trained with phrase labels. In this way, all combinations of possible types (word and/or PoS) and possible target classes (phrases with or without PoS) are examined. 

\setlength{\tabcolsep}{4pt}
\begin{table}[h]
\begin{center}
\smallskip
\begin{tabular}{l|l}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{\sc types to learn from} & {\sc target classes} \\
\noalign{\smallskip}
\hline 
\noalign{\smallskip}
Words & PoS + Phrases \\
Words & Phrases \\
Words + PoS & Phrases \\
PoS & Phrases\\
\noalign{\smallskip}
\hline
\end{tabular}
\caption{Combinations of the linguistic features in learning.\label{ling}}
\end{center}
\end{table}

In order to examine how the size of the training set influences the performance of the classifiers, each tagger is trained in each experiment nine times on subsets of the same data set of various sizes from one thousand to five hundred thousand tokens: 1k, 2k, 5k, 10k, 20k, 50k, 100k, 200k, and 500k tokens respectively. Then, the same test set, consisting of 117,530 tokens, is annotated by each classifier. In each experiment, the training and test sets are disjoint. 


\section{Results}
In this section, the results from the four learning tasks as described in Section~\ref{features} are presented. In each experiment, the evaluation is based on the widely used measure {\em accuracy}, which is obtained by dividing the number of correctly labeled tokens by the total number of tokens, see Equation~\ref{accur}. A correct parse requires complete and correct phrase labels for a token including the position information ({\sc bio} tag). If the word lacks a label for a phrase that it is part of, or if a phrase label has wrong position information, then the whole tag is considered to be incorrect.   
 
\begin{equation}
\label{accur}
Accuracy = \frac{\#\,of\,correctly\,tagged\,tokens}{\#\,of\,tokens}
\smallskip
\end{equation}

In some cases, the performance of the classifiers is also measured with precision, recall, and F$_{\beta=1}$ rates for each single phrase type given by the hierarchical annotation. Each phrase type is extracted from the concatenated phrase label and counted as described below in Equations~\ref{prec}, \ref{rec} and \ref{fb}.  

\begin{equation}
\label{prec}
Precision = \frac{\#\,of\,correctly\,tagged\,tokens\,as\,phrase\,type\,X}{\#\,of\,detected\,tokens\,as\,phrase\,type\,X}
\smallskip
\end{equation}

\begin{equation}
\label{rec}
Recall = \frac{\#\,of\,correctly\,tagged\,tokens\,as\,phrase\,type\,X}{\#\,of\,tokens\,as\,phrase\,type\,X}
\smallskip
\end{equation}

\begin{equation}
\label{fb}
F_{\beta=1} = \frac{(\beta^2+1)\,*\,Precision\,*\,Recall}{\beta^2\,*\,Precision+Recall}
\bigskip
\end{equation}


Before we go into details about the results, several aspects that might influence the performance of the classifiers have to be considered. One of these concerns the number of target classes the learners learn to predict in the different experiments when training is performed on various sizes of data sets with different linguistic features involved in learning. Due to the hierarchical annotation, the number of possible combinations of phrase types lies between 260 and 3100 classes, depending on the size and the type of the training data. The relationship between the size of the training set and the number of classes that the learners search through to predict the phrase tags with and without PoS information is shown in Figure~\ref{class}.  We can see that the number of target classes increases with the size of the training set, as well as when the prediction of PoS tags together with phrase tags is required by the learners. 


\begin{figure}[h]
\samepage
\centerline{\epsfig{figure=classes.ps,width=100mm}}
\caption{The number of target classes in training data of various size.\label{class}}
\end{figure}

Another aspect that might have an influence on the performance concerns the number of token types appearing in the training data, i.e.\ the number of different lexical token types, part-of-speech tags or a combination of these. Figure~\ref{tokens} shows, not surprisingly, that the number of token types increases with the size of the training set. The increase is largest when lexical information serves as a basis in the learning process, and lowest when training is performed on PoS sequences only without the presence of the words due to the low number of PoS tags; the total number of PoS tags lies between 82 and 143 depending on the size of the training set. It is also worth noting that the number of types is somewhat higher when both lexical and PoS information is included in the training to learn the phrase categories, compared to when only the words are present. The reason is, naturally, that there are no homonyms because of the presence of the PoS tags attached to each token.    


\begin{figure}[h]
\samepage
\centerline{\epsfig{figure=tokentypes.ps,width=100mm}}
\caption{The number of token types in training data of various size.\label{tokens}}
\end{figure}

The percentage of token types that can have more than one target class in the training data is also of interest since the algorithms have to learn to choose the correct class among the possible ones given a certain context. Figure~\ref{ambi} shows the percentage of ambiguous token types in the training data of various size for the four types of learning experiments. Between 60\% and 85\% of the PoS types are ambiguous while for the lexical types including words with or without PoS the percentage of ambiguous types is significantly lower. However, when the target class constitutes PoS and phrase structure, the number of ambiguous token types is higher than it is when the target class contains phrase labels alone.  


\begin{figure}[h]
\samepage
\centerline{\epsfig{figure=ambigtokentypes.ps,width=100mm}}
\caption{The percentage of ambiguous token types in training data of various size.\label{ambi}}
\end{figure}

The percentage of unknown tokens is also of interest since the classification task becomes harder when the test set includes a large number of unknown tokens, tokens that are not present in the training data. Figure~\ref{unknown} illustrates the percentage of unknown tokens in the test set compared to the training sets of different sizes for the various learning tasks. Not surprisingly, the number of unknown tokens is very small or zero when training is performed on PoS sequences only since the majority of PoS tags appears in the training data. 
The largest number of unknown tokens is found when learning is based on lexical and PoS information on smaller training corpora containing up to 100k tokens. On the other hand, if a large training set is used, containing both lexical and PoS information, the number of unknown words decreases compared to when training is performed on lexical information only.  


\begin{figure}[h]
\centerline{\epsfig{figure=unknowntokens.ps,width=100mm}}
\caption{The percentage of unknown tokens in the test data compared to the training data of various size.\label{unknown}}
\end{figure}

\begin{figure}[h]
\centerline{\epsfig{figure=baseline-perf,width=100mm}}
\caption{Baseline performance for each experiment for training data of various sizes.\label{baseline}}
\end{figure}


Lastly, in order to evaluate the effectiveness of the classifiers for the four learning tasks, baseline performance is relevant since it describes a minimal performance rate that each classifier should achieve. Baseline values have therefore been obtained for the test data of the four types of learning tasks. The baseline is counted in different ways depending on the input the learners get and the class they have to learn to predict. Each known token in the test data receives a class label (i.e.\ either PoS + Phrases, or Phrases) that is most frequently associated with that token type in the training data. Tokens not in the training data are treated as wrongly annotated. In Figure~\ref{baseline}, the results are shown for the training data of various sizes within each experiment type. On average, baseline performance is lowest when lexical information is involved in training. When PoS categories are also included in the training set, baseline performance increases. We can also notice that the size of the training set influences the accuracy; when training is performed on large training corpora, the baseline accuracies for the four types of training sets become more even.  


With these prerequisites in mind, the results given by the classifiers for each learning task will be described. 


\subsection{Performance of the Classifiers}
\label{perfclass}
To present an overall picture of the parsers' performance, the accuracy of each classifier, when training is performed on 200k tokens, is listed in Table~\ref{acc}. Performance measures for known and unknown tokens are also listed separately. The performance of the classifiers varies depending on what type of information is included in the training data. The best average performance of all three parsers is achieved when only PoS information constitutes the input to the classifiers. When PoS information is not present in learning, the accuracy of all algorithms drops markedly.

\setlength{\tabcolsep}{2pt}
\begin{table}[h]
\begin{center}
\smallskip
\begin{tabular}{lr|lr|ccc|ccc|ccc}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{\sc types} & number & {\sc classes} & number & \multicolumn{3}{c|}{\sc fntbl} & \multicolumn{3}{c|}{\sc mxpost} & \multicolumn{3}{c}{\sc tnt}  \\
\cline{5-13}
& & & & \multicolumn{1}{c}{\sc t} & \multicolumn{1}{c}{\sc k} & \multicolumn{1}{c|}{\sc u} & \multicolumn{1}{c}{\sc t} &\multicolumn{1}{c}{\sc k} & \multicolumn{1}{c|}{\sc u} &  \multicolumn{1}{c}{\sc t} & \multicolumn{1}{c}{\sc k} &\multicolumn{1}{c}{\sc u} \\
\hline 
Words & 35,611 & PoS + Phrases & 2,492 & 72.8 & 77.6 & 36.9 & {\bf 77.9} & {\em 80.1} & {\em 61.2} & 72.2 & 75.0 & 51.9 \\
Words & 35,611 & Phrases & 534  & 75.1 & 77.9 & 54.4 & {\bf 81.7} & {\em 83.0} & {\em 72.0} & 72.8 & 75.0 & 56.3 \\
Words + PoS & 37,870 & Phrases & 534 & 83.3 & 83.5 & 82.4 & {\bf 87.9} & {\em 88.4} & {\em 84.3} & 79.9 & 80.0 & 79.5 \\
PoS & 141 & Phrases & 534 & {\bf 94.8} & {\em 94.8} & {\em 70.4} & 90.0 & 90.0 & 20.0 & 92.0 & 92.1 & 40.0 \\
\noalign{\smallskip}
\hline
\end{tabular}
\caption{The results are given for each classifier when trained on 200k tokens on the four types of input, and tested on 117,530 tokens. Accuracy (\%) is calculated for the total number of tokens ({\sc t}), as well as for known ({\sc k}) and unknown ({\sc u}) tokens.\label{acc}}
\end{center}
\end{table}
The transformation-based learner, {\sc fntbl}, achieves best performance when only PoS information is included in training, while in the other experiments, the maximum entropy tagger, {\sc mxpost}, obtains highest accuracy. However, when training is performed on lexical sequences only, {\sc tnt} obtains better results for the annotation of unknown tokens than {\sc fntbl} does.   

The nine phrase types are also evaluated separately by extracting each phrase type from the concatenated tags, i.e.\ by not considering the correctness of the phrasal categories on lower and/or higher nodes in the tree in the evaluation process. Precision, recall and F$_{\beta = 1}$ rates are measured for the phrase types given by the parsers trained on 200k tokens on various types of input features, and tested on 117,530 tokens. The F$_{\beta = 1}$ scores are given in Table~\ref{FB1}, and the complete set of values is listed in the Appendix in Table~\ref{PRF}. Highest scores for a phrase type are printed in bold, while the highest values for the various learning types are italicized. 

\begin{table}[h]
\setlength{\tabcolsep}{2pt}
\begin{center}
\smallskip
\begin{tabular}{l|l|r|r|r|r|r|r|r|r|r}
\hline
\multicolumn{1}{c|}{Feature} & \multicolumn{1}{c|}{Class} & \multicolumn{1}{c|}{{\sc advp}} & \multicolumn{1}{c|}{{\sc ap}} & \multicolumn{1}{c|}{{\sc\small apmax}} & \multicolumn{1}{c|}{{\sc infp}} & \multicolumn{1}{c|}{{\sc np}} & \multicolumn{1}{c|}{{\sc\small npmax}} & \multicolumn{1}{c|}{{\sc nump}} & \multicolumn{1}{c|}{{\sc pp}} & \multicolumn{1}{c}{{\sc vc}} \\
\hline
& \multicolumn{1}{c|}{Total} & 5,970 & 10,477 & 1,433 & 2,541 & 53,810 & 24,350 & 1,951 & 29,419 & 16,282 \\
\noalign{\smallskip}
\hline
\hline
 & {\sc fntbl} & {\em 83.5} & 74.0 & 14.4 & 87.4 & 94.2 & 77.3 & 90.1 & 85.0 & 93.5 \\
W $\rightarrow$ PoS+Ph & {\sc mxpost} & 81.2 & 78.3 & 24.8 & {\em 93.8} & 95.7 & {\em 79.4} & 89.5 & {\em 88.4} & 96.1 \\
 &{\sc t}n{\sc t} & {\em 83.5} & {\em 80.3} & {\em 27.1} & 88.9 & {\em 96.3} & 66.8 & {\em 91.9} & 79.3 & {\em 96.8} \\
\hline 
 & {\sc fntbl} & {\em 83.0} & 74.3 & 22.5 & 86.4 & 94.2 & 74.4 & 91.0 & 83.0 & 93.5 \\
W $\rightarrow$ Ph & {\sc mxpost} & 80.9 & 79.1 & {\em 28.8} & {\em 89.9} & {\em 95.5} & {\em 80.0} & 89.7 & {\em 88.7} & 95.7 \\
 &{\sc t}n{\sc t} &  81.6 & {\em 79.2} & 27.9 & 87.3 & 95.5 & 66.3 & {\em 91.5} & 78.3 & {\em 96.0} \\
\hline
 & {\sc fntbl} & 99.3 & 86.0 & 41.2 & 93.9 & 97.4 & 81.0 & 95.6 & 87.9 & 99.2 \\
W+PoS $\rightarrow$ Ph & {\sc mxpost} & 98.5 &  89.1 & {\em 47.2} & {\em 97.3} & {\em 98.3} & {\em 84.6} & 94.3 & {\em 90.9} & {\em 99.4} \\
 &{\sc t}n{\sc t} & {\bf 99.4} & {\em 89.3} & 43.0 & 91.8 & 98.0 & 73.1 & {\em 96.7} & 83.0 & 99.1 \\
\hline 
 & {\sc fntbl} & {\em 80.5} & {\bf 95.3} & {\bf 86.6} & {\bf 100.0} & {\bf 99.3} & {\bf 97.6} & 98.0 & {\bf 98.1} & {\bf 100.0} \\
PoS $\rightarrow$ Ph & {\sc mxpost} & 77.7 & 91.9 & 75.1 & 99.0 & 98.7 & 87.2 & 96.9 & 93.3 & 99.9 \\
 & {\sc t}n{\sc t} & 76.8 & 93.9 & 78.0 & 98.3 & 98.9 & 95.7 & {\bf 98.4} & 96.7 & 99.8 \\
\hline
\end{tabular}
\caption{F$_{\beta = 1}$ rates for each classifier when trained on 200k tokens on the four types of input features, and tested on 117,530 tokens. The total number of occurrences for each phrase type in the benchmark is given in the second row.\label{FB1}}
\end{center}
\end{table}

On average, verb clusters ({\sc vc}) and infinitive phrases ({\sc infp}) are easiest to classify, followed by noun phrases ({\sc np}), prepositional phrases ({\sc pp}), and numerical expressions ({\sc nump}). Adjective phrases, especially the maximal projections of {\sc ap}s ({\sc apmax}), receive a surprisingly low F$_{\beta = 1}$, when lexical information is involved in the learning task. Most of the conjoined {\sc ap}s are not found by the classifiers at all---the recall values are exceptionally low---as shown in Table~\ref{PRF}. Maximal projections of noun phrases ({\sc npmax}) are also difficult to detect compared to other phrase types, even though the recall rates are considerably higher than for {\sc apmax}. The low recall values for the maximal projections in general could be the result of the biased training data and benchmark caused by the rule-based context-free parser. 

We can see that the best values for eight of the phrases are achieved when training is performed on PoS sequences only. Adverb phrases ({\sc advp}), on the other hand, are more often correctly detected when lexical information is included in learning. An explanation for this can be found in the annotation of adverbs in the SUC corpus where the discrimination of the adverbs is made on the basis of their morphological structure, rather than their syntactic characteristics. Thus, sentence adverbs do not belong to a distinct PoS category. For this reason these adverbs had to be listed in the rule-based parser in order to correctly detect the phrase structure. Therefore, the data-driven parsers, when trained on PoS sequences only, wrongly analyze adverb phrases, shown by the comparatively low precision. 

These results do not tell us about the algorithms' sensitivity to the size of the training set when different types of information are used in learning. One might surmise that the larger amount of data we use, the better performance we get. However, the improvement does not necessarily have to be the same for the algorithms when we train them on various input features. Next, the effect of the different linguistic information used in learning will be described.
 
\subsection{The Effect of the Linguistic Features}
\label{effectoffeatures}
The results for each experiment on the learners' sensitivity to the input feature sets (word, word and PoS, PoS only), and to the number of target classes (phrases with or without PoS), are shown in Figures \ref{w-poph}, \ref{w-ph}, \ref{wpo-ph} and \ref{po-ph}, respectively.
All systems in all four experiments outperform the baseline independently of the type of linguistic features involved in learning or of the size of the training set.

The first learning task, where training is performed on the basis of lexical information only, to predict the PoS together with the correct phrase labels ({\sc word} $\rightarrow$ {\sc P}o{\sc S} + {\sc phrases}), is the most difficult classification task for every algorithm (see Figure~\ref{w-poph}). This is not surprising since the systems have to learn a great number of classes, between 264 and 3099 tags, depending on the size of the training set. Thus, in this experiment, the hypothesis space that the algorithms have to search through is large. The classifiers here are treated as PoS taggers and parsers. {\sc tnt} has the lowest error rate when training is performed on small training sets consisting of up to 20k tokens while {\sc mxpost} outperforms {\sc tnt} when using 50k or more tokens for training. It is also worth noticing that {\sc fntbl} achieves higher performance than {\sc mxpost} when training is done on very small training sets because of {\sc fntbl}'s higher accuracy achieved for the annotation of known words. 

\begin{figure}[h]
\centerline{\epsfig{figure=w-posph-learcurve.ps,width=100mm}}
\caption{The error rate for each classifier when training is performed on the basis of lexical information to classify PoS and phrase structure information.\label{w-poph}}
\end{figure}

In the second learning task, where PoS information is not present in the training data, i.e.\ the training is performed entirely on lexical information ({\sc word} $\rightarrow$ {\sc phrases}), the hypothesis space becomes smaller than in the first experiment due to a decrease in the number of classes. The smaller tag set makes the classification task easier and average system performance increases (see Figure~\ref{w-ph}). Similarly to the first experiment, the maximum entropy approach, {\sc mxpost}, achieves the lowest error rate in cases in which the training corpus consists of more than 5k tokens. {\sc tnt} obtains the best result when the training set is small (up to 5k tokens), while {\sc fntbl} outperforms {\sc tnt} on large training sets (200k tokens or more). 

\begin{figure}[h]
\centerline{\epsfig{figure=w-ph-learcurve.ps,width=100mm}}
\caption{The error rate for each classifier when training is performed on the basis of lexical information to predict the phrase tags.\label{w-ph}}
\end{figure}

In the third learning task, where both lexical and PoS information is included as input features for the recognition of the phrasal structures ({\sc word + P}o{\sc S} $\rightarrow$ {\sc phrases}), the average performance of the classifiers further increases (see Figure~\ref{wpo-ph}). A possible explanation for the increase of the systems' performance can be that although in this experiment we find the largest number of token types, the problem of lexical homonymy is eliminated, since every token type becomes unique by the PoS tag attached to it. We thereby reduce the number of possible parse trees. Just as in the first two experiments, {\sc mxpost} has the lowest error rate when a large training set is used in learning (5k tokens or more), and {\sc tnt} succeeds well when learning is performed on small data sets (up to 5k tokens). {\sc fntbl} succeeds better than {\sc tnt} when large training sets serve as input (200k tokens or more), and has highest error rate when training is carried out on small corpora (up to 50k tokens). 

\begin{figure}[h]
\centerline{\epsfig{figure=wpos-ph-learcurve.ps,width=100mm}}
\caption{The error rate for each classifier when training is performed on the basis of lexical information together with the correct PoS to predict the phrase labels.\label{wpo-ph}}
\end{figure}


\begin{figure}[h]
\centerline{\epsfig{figure=pos-ph-learcurve.ps,width=100mm}}
\caption{The error rate for each classifier when training is performed entirely on the basis of the PoS to predict the phrase labels.\label{po-ph}}
\end{figure}

Lastly, in the fourth learning task, where lexical information is not present in training ({\sc P}o{\sc S} $\rightarrow$ {\sc phrases}), the performance of the systems increases greatly compared to cases in which lexical information is included in the training process. This can be explained by the low percentage of unknown tokens (PoS tags) in small training sets, and the absence of unknown tokens when a large training corpus is used (see Figure~\ref{unknown}). The baseline performance therefore increases and the learning curves of the classifiers converge (see Figure~\ref{po-ph}). {\sc fntbl} obtains the best accuracy on average compared to the statistical approaches {\sc tnt} and {\sc mxpost}. 

To summarize the effect of the linguistic information included in training, the best results are obtained by excluding all lexical information from the learning process. However, if the user would like to use lexical information, each token should be annotated with its PoS tag during learning, thereby eliminating homonyms, and achieving higher system performance. It is worth noting though that when using words and PoS as input, the taggers used see the input as an atom and cannot ignore the word itself. If the input features were separated from each other, the results would be different. Considering the results given when training is performed on lexical information alone, all systems perform better when recognizing the phrasal structure without the prediction of PoS tags. The statistical approaches can better learn from many different token types, while the transformation-based learner achieves highest accuracy in cases where a small number of token types is involved in learning.

Next, we will look at the effect that the size of the training set has on the three systems given the four learning tasks. 

\subsection{The Effect of the Size of the Training Set}
\label{size}
As we can see from Figures \ref{w-poph}, \ref{w-ph}, \ref{wpo-ph}, and \ref{po-ph}, accuracy is improved for all systems by increasing the size of the training corpus. The fact that the learning task becomes easier with a larger training corpus is not surprising, since as we increase the size of the training set, we increase the number of different contextual environments in which the token types (i.e.\ the PoS tag, the word, or both together) can appear, as well as decrease the percentage of unknown tokens, as was shown in Figure~\ref{unknown}.  It has to be pointed out that an additional large lexicon listing all possible classes for a token type can be used in {\sc fntbl} and {\sc tnt} during learning in order to decrease the total number of unknown tokens and thereby increase system performance. However, such a lexicon was not used in this study. 

The systems show different sensitivity to the size of the training corpus in the various experiments. The maximum entropy approach, {\sc mxpost}, achieves the lowest error rate when a large training corpus containing lexical information is used. The hidden Markov model, {\sc tnt}, on the other hand, obtains fewest errors when trained on small data sets with lexical information included in training, and shows the lowest sensitivity to the size of the training set compared to the other approaches. The transformation-based approach, {\sc fntbl}, succeeds well in cases when large training sets are used, especially when PoS is included as a feature type.

The reason for the different sensitivity the systems show can possibly be explained by the type of background knowledge that is implemented in the systems. The success of {\sc tnt} when training is performed on small data sets might depend on the smoothing strategy incorporated for handling sparse data (in this study, linear interpolation is used when training {\sc tnt}), while the other systems do not use smoothing. The success of {\sc tnt} might also depend on the parameters implemented in the system for the annotation of unknown words ({\sc tnt} checks up to the last ten characters of a token while the other approaches use affix analysis up to four characters only). Both the number of token types appearing only once and the number of unknown tokens are high when using a small training set. For example, 77\% and 86\% of token types appear only once when the training data consists of 20k and 1k tokens respectively; 51\%  and 20\% of the words are unknown in the test data given training sets including 1k and 50k tokens each (as shown in Figure~\ref{unknown}). 

The success of the maximum entropy approach, {\sc mxpost}, achieving lowest error rates when the training corpus is large and includes lexical information, can be explained by the window size the system uses for disambiguation. {\sc mxpost} looks at a larger window size, a context of two preceding tags, and two preceding and following tokens, while {\sc tnt} uses a context of three tags, only. 

The transformation-based learner, {\sc fntbl}, on the other hand, does not perform well on small training sets and obtains the highest error rate in the annotation of unknown words. When only PoS categories are used as the basis for learning, we eliminate the problem of the analysis of unknown words, thereby making the classification easier for {\sc fntbl}. However, this might not be the only reason for {\sc fntbl}'s success. The contextual environment that {\sc fntbl} uses for disambiguation to predict the phrasal structure of a particular PoS tag is largest among the PoS taggers. {\sc fntbl} uses a window size of up to seven tokens/tags, that is a context of up to three preceding and following tokens and/or tags. 

Thus, as we have seen, the type of linguistic information used in learning, and the size of training set are both important factors that we have to consider when building data-driven chunkers/parsers. However, the reader should keep in mind that the results presented above do not show differences between the algorithms {\em per se}, since the algorithms are trained with different parameters. Rather, the results only let us compare the implementations of the algorithms, i.e.\ the PoS taggers, that are used---as they are---with their default settings for the parsing task.   

\subsection{The Effect of Background Knowledge}
In the previous sections, clear differences were found between the parsers' performance for the various learning tasks. We hypothesized that the background knowledge the parsers use for the identification of unknown words and their disambiguation strategies may play an important role in the systems' performance. The question is whether the results obtained can be related to the properties of the algorithms {\em per se}, or to the parameters (such as suffix analysis or window size) used by the taggers.

In this section, we present a pilot investigation on how the parameters used in the implementation of algorithms might influence system performance. First, {\sc fntbl} is trained with the same parameter settings as {\sc mxpost} and {\sc tnt} use (see Section \ref{algorithms}). Training is carried out separately with regard to lexical parameters for the analysis of unknown tokens, contextual parameters for the disambiguation of known tokens, and the combination of both types, according to the taggers' ({\sc mxpost} and {\sc tnt}) settings. Other types of features, for example smoothing, were not implemented in {\sc fntbl} in order to simulate {\sc tnt}. Second, {\sc tnt} is trained with different smoothing methods and suffix analysis. For {\sc fntbl}, the change of parameters is straightforward and easily applicable while for {\sc mxpost} and {\sc tnt}, the source code is not included in the releases. Therefore, re-implementations of these algorithms would be necessary in order to be able to include the same parameters in each system.

In this experiment, training is performed on a small, medium, and large data set---the same as was used in the previous experiments---consisting of 2k, 20k, and 200k tokens respectively on the four types of training sets with different types of linguistic information included in learning. The test set is the same as in the experiments described above. 

The results for {\sc fntbl} using the parameter settings of {\sc mxpost} and {\sc tnt} are shown in Table~\ref{tblasmxpost} and Table~\ref{tblastnt} respectively when the four types of data sets of various sizes served as training data. In the second and the last columns, the parsers' original performance is shown (reported in Section~\ref{size}) while in columns three, four and five, the results given by the simulation experiment are listed. Accuracy is also shown for known and unknown words separately in the Appendix in Table~\ref{simul}.  

Considering the results, the original implementations of the systems have highest overall performance in all training experiments. When {\sc fntbl} is trained with the same lexical and contextual parameters as {\sc mxpost} (see column 3 in Table~\ref{tblasmxpost}) with words included in learning, accuracy drops markedly compared to {\sc mxpost}'s original performance. On the other hand, when training is performed on PoS sequences only, {\sc fntbl}'s accuracy somewhat increases compared to {\sc mxpost}'s original results. The results indicate differences in algorithm bias; {\sc fntbl} has the advantage of learning on the basis of a few token types while {\sc mxpost} learns best when a large number of types is included in the training data. However, {\sc fntbl}'s original performance and the results when it is trained with {\sc mxpost}'s lexical and its own contextual parameters (see column 4) are directly comparable. This is due to the fact that the same parameters are used by both {\sc fntbl} and {\sc mxpost} for affix analysis. However, when only {\sc mxpost}'s contextual parameters are used (see column 5), thereby decreasing the window size of {\sc fntbl} from seven to five, accuracy decreases for large training sets because of the higher error rate of the annotation of known tokens (see Table~\ref{simul}) while performance increases somewhat for small data sets. 

\setlength{\tabcolsep}{1.5pt}
\begin{table}[h]
\begin{center}
\smallskip
\begin{tabular}{l|rrr|rrr|rrr|rrr|rrr}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{Accuracy (\%)} & \multicolumn{3}{c|}{{\sc mxpost}} & \multicolumn{3}{c|}{\sc \small mxpost-lex} & \multicolumn{3}{c|}{\sc \small mxpost-lex}& \multicolumn{3}{c|}{{\sc \small fntbl-lex}} & \multicolumn{3}{c}{{\sc fntbl}}\\
 & \multicolumn{3}{c|}{\sc original} & \multicolumn{3}{c|}{{\sc \small mxpost-con}} & \multicolumn{3}{c|}{{\sc \small fntbl-con}} &\multicolumn{3}{c|}{\sc \small mxpost-con} & \multicolumn{3}{c}{{\sc original}}\\
\noalign{\smallskip}
\hline
{Information} 
&\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} &\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c}{\small 200k} \\
\hline 
\noalign{\smallskip}
Word $\rightarrow$ PoS+Ph & 40.7 & 62.9&77.9&43.2&58.1&70.9&43.0&59.0&72.7&42.5&58.6&70.7&44.3&58.5&72.8\\
Word $\rightarrow$ Ph &50.6 &72.5&81.7&47.8&64.0&73.1&47.9&64.5&72.2&48.9&63.4&73.7&48.5&64.4&75.1\\ 
Word+PoS $\rightarrow$ Ph &71.3 &84.6&87.9&60.0&74.7&80.9&60.5&75.5&83.2&63.4&74.5&81.8&60.8&76.2&83.3\\
PoS $\rightarrow$ Ph & 75.5&88.4&90.0&76.6&87.4&91.7&78.3&89.4&94.8&76.6&87.4&91.7&78.6&89.4&94.8\\
\hline
\end{tabular}
\caption{Accuracy (\%) is shown for {\sc fntbl} when trained with the same parameters---either lexical (column 4), contextual (column 5), or both (column 3)---as {\sc mxpost}.\label{tblasmxpost}}
\end{center}
\end{table}

The accuracy rates for {\sc fntbl} trained by using {\sc tnt}'s lexical and/or contextual parameters are shown in columns 3, 4 and 5 in Table~\ref{tblastnt}. Here, as was the case in the previous example, the original implementation of {\sc tnt} achieves higher performance than {\sc fntbl} using {\sc tnt}'s lexical and contextual parameter settings (see columns 2 and 3). The difference is largest for small training sets, and the error rate on average is highest for the annotation of unknown words (see Table~\ref{simul}) indicating that {\sc tnt} has a better method for analyzing unknown tokens. By extending {\sc fntbl}'s lexical templates from four to up to ten characters while keeping the same window size (see column 4), performance of {\sc fntbl} is directly comparable to {\sc fntbl}'s original results. However, when we decrease the window size instead from seven to three tags (see column 5), performance decreases as the size of the training set increases due to the high error rate for the annotation of known tokens, as is shown in Table~\ref{simul}.

\setlength{\tabcolsep}{1.5pt}
\begin{table}[h]
\begin{center}
\begin{tabular}{l|rrr|rrr|rrr|rrr|rrr}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{Accuracy (\%)} & \multicolumn{3}{c|}{{\sc tnt}} & \multicolumn{3}{c|}{\sc \small tnt-lex} & \multicolumn{3}{c|}{\sc \small tnt-lex}& \multicolumn{3}{c|}{{\sc \small fntbl-lex}} & \multicolumn{3}{c}{{\sc fntbl}}\\
 & \multicolumn{3}{c|}{\sc original} & \multicolumn{3}{c|}{{\sc \small tnt-con}} & \multicolumn{3}{c|}{\small {\sc \small fntbl-con}} &\multicolumn{3}{c|}{\sc \small tnt-con} & \multicolumn{3}{c}{{\sc original}}\\
\noalign{\smallskip}
\hline
{Information} 
&\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} &\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c}{\small 200k} \\
\hline 
\noalign{\smallskip}
Word $\rightarrow$ PoS+Ph & 49.5&63.2&72.2&43.5&56.1&64.7&44.5&58.9&72.4&43.1&56.4&64.5&44.3&58.5&72.8\\
Word $\rightarrow$ Ph &57.9&66.9&72.8&48.5&61.8&71.4&48.1&63.9&75.1&48.3&60.9&71.4&48.5&64.4&75.1\\ 
Word+PoS $\rightarrow$ Ph &73.1 &78.5&79.9&62.5&73.0&77.0&60.8&76.2&81.3&61.9&72.8&78.1&60.8&76.2&83.3\\
PoS $\rightarrow$ Ph & 76.6&88.1&92.0&73.5&81.1&83.4&78.2&89.4&94.8&73.6&81.1&83.4&78.6&89.4&94.8\\
\hline
\end{tabular}
\caption{Accuracy (\%) is shown for {\sc fntbl} when trained with the same parameters---either lexical (column 4), contextual (column 5), or both (column 3)---as {\sc tnt}.\label{tblastnt}}
\end{center}
\end{table}

Lastly, we hypothesized earlier that the reason for {\sc tnt}'s success on small corpora might be the smoothing strategy for sparse data and/or the suffix analysis of up to ten characters for unknown words. In this experiment, {\sc tnt} is trained without any smoothing used in learning, as well as with a lexical parameter setting using four-character analysis only. The results are summarized in Table~\ref{tntbi}. It is obvious that when training is performed without any smoothing, accuracy decreases to a great extent. Without smoothing involved in learning, the other approaches would outperform {\sc tnt} in the experiments described in previous sections. We can also notice that, in contrast to what was expected, a suffix analysis of up to four characters gives higher performance on average than that of ten letters.

\setlength{\tabcolsep}{3pt}
\begin{table}[h]
\begin{center}
\begin{tabular}{l|rrr|rrr|rrr|rrr}
\noalign{\smallskip}
\hline
\noalign{\smallskip}
{Accuracy (\%)} & \multicolumn{3}{c|}{{\sc tnt}} & \multicolumn{3}{c|}{\sc \small no smoothing} & \multicolumn{3}{c|}{\sc \small no smoothing}& \multicolumn{3}{c}{{\sc \small smoothing}}\\
& \multicolumn{3}{c|}{\sc original} & \multicolumn{3}{c|}{{\sc \small 10 letters}} & \multicolumn{3}{c|}{\sc \small 4 letters} &\multicolumn{3}{c}{\sc \small 4 letters} \\
\noalign{\smallskip}
\hline
{Information} 
&\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} &\multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c|}{\small 200k} & \multicolumn{1}{c}{\small 2k} & \multicolumn{1}{c}{\small 20k} & \multicolumn{1}{c}{\small 200k} \\
\hline 
\noalign{\smallskip}
Word $\rightarrow$ PoS+Ph & 49.5&63.2&72.2&30.1&43.3&52.1&35.1&43.2&52.4&49.9&65.0&73.7\\
Word $\rightarrow$ Ph &57.9&66.9&72.8&49.6&58.4&64.6&50.0&59.5&65.5&58.7&68.9&74.3 \\
Word+PoS $\rightarrow$ Ph &73.1&78.5&79.9&60.5&68.3&71.5&61.9&70.0&70.6&76.7&78.9&79.8 \\
PoS $\rightarrow$ Ph &76.6&88.1&92.0&64.1&83.9&91.6&64.1&83.9&91.6&76.6&88.2&92.1 \\
\hline
\end{tabular}
\caption{Accuracy (\%) is shown for {\sc tnt} when trained  on 2k, 20k, and 200k tokens using suffix analysis of 4 and 10 characters with and without linear interpolation as smoothing strategy.\label{tntbi}}
\end{center}
\end{table}

The results indicate that both the algorithm bias and the parameter settings used in learning play an important role in system performance. However, further investigation is necessary to find out the relationship between algorithm and information bias, as was pointed out by \citet{dePauw00}.
Next, we will describe how to use the results, reported in this paper, in real-world applications in an efficient way.  

\subsection{Evaluation for Real-World Applications}

The reader might ask how we can apply the results described in this paper in real-world applications in which the system needs both PoS tagged and parsed text. An obvious solution is to let the best PoS tagger available for the particular language or domain annotate the text to be analyzed. The next step would be to extract the PoS labels from the text but keep the sentence division, and let the parser annotate the PoS sequences. The only thing then remaining to do would be to put the words back into the parsed PoS sequences. 

Obviously, if the user does not have the text annotated with correct PoS tags but has to use a tagger for that purpose, the performance of the parser can be expected to decrease. Therefore, an evaluation for real-world applications appears to be necessary. 

Since the results described above show that the most successful parsing classification is achieved by training on PoS categories only to predict the constituent structure of a token, the parsers which were trained entirely on PoS information were used for the chosen real-world evaluation task. First, the unannotated test data was tagged by a PoS tagger. For that purpose, {\sc tnt} was used since this tagger has been shown to achieve the highest accuracy on larger training sets for Swedish \citep{Megyesi01}. Second, the words were removed from the PoS tagged text, and the PoS sequences were labeled with phrase categories by each parser. The PoS tagger {\sc tnt}, as well as the three parsers (in the setting where only PoS sequences were included in training), were trained on 500k tokens. The reason for training the PoS tagger, {\sc tnt}, on 500k correctly annotated tokens only (and not on the whole SUC corpus) was to ensure that none of the test sentences were included in the training data. 

The results are shown in Table~\ref{worldeval}. The performance of the PoS tagger is 94.98\%. As was expected, the performance of the parsers is considerably lower than was reported for correct PoS sequences in Section~\ref{size}. {\sc fntbl} achieves the highest accuracy, followed by {\sc mxpost} and {\sc tnt}, just as in the fourth experiment, described in Sections~\ref{perfclass} and \ref{effectoffeatures}.  

\setlength{\tabcolsep}{6pt}
\begin{table}[h]
\begin{center}
\begin{tabular}{c|lllll|lllllll}
\hline
\noalign{\smallskip}
{\sc tagger} & {\sc parser} & {\sc result} \\
\noalign{\smallskip}
\hline
\noalign{\smallskip}
& {\sc fntbl} & {\bf 90.83\%} \\
94.98\% & {\sc mxpost} & 88.87\% \\
& {\sc tnt} & 88.19\% \\
\noalign{\smallskip}
\hline
\end{tabular}
\caption{Accuracy (\%) is given for each classifier when the test set was first tagged by the PoS tagger {\sc tnt}, then parsed with the three classifiers based on PoS sequences.\label{worldeval}}
\end{center}
\end{table} 

Thus, we have seen that the morphological and shallow syntactic annotation of words in texts, including PoS with morphological features and the whole hierarchical phrase structure that the word belongs to, is possible with approximately 90\% accuracy.

\section{Conclusion and Future Work}
This paper has presented empirical results on the application of publicly available PoS taggers to shallow parsing of Swedish texts. The PoS taggers included in the study are the transformation-based learner {\sc fntbl}, the maximum entropy approach {\sc mxpost}, and the hidden Markov model Trigrams'n'Tags ({\sc tnt}). The goal of the shallow parsers is to recognize the constituent structure of the sentence, representing the whole hierarchical structure the token belongs to in the parse tree. The encoding is based on the concatenation of the phrase tags on the path from lowest to higher nodes. The results show that the data-driven, language and tag set independent PoS taggers can be efficiently used for shallow parsing of texts, given that PoS information only, i.e.\ without the presence of words, is included in the training data. 

Several aspects of the classifiers were evaluated, such as the taggers' sensitivity to certain kinds of linguistic information included in the training data. Particular attention has been directed to the various types of input features that the taggers learn from, such as words, PoS tags, and a combination of both. Also, experiments have been carried out on various numbers of target classes that the taggers have to search through in order to predict phrasal categories only, or to recognize both PoS and the phrasal structure of the tokens. Furthermore, the taggers' sensitivity to the size of the training set including different linguistic information was investigated. Additionally, some of the effects of the parameter settings used by PoS taggers have been examined. 

The results show that for all three systems, best performance is obtained if the number of token types the taggers learn from is reduced by only considering the PoS tags. By excluding the lexical information during learning and testing, all classifiers obtain an accuracy above 92\% when the training set contains at least 50k tokens. 

However, the type of linguistic information, the size of the training set, the algorithms, as well as the parameters used by the algorithms are all factors that influence system performance. This study shows clear differences between the taggers' sensitivity to the type of information used in learning, and the number of target classes to be learned.
 
The transformation-based learner, {\sc fntbl}, obtains best results when training is performed on PoS categories alone. However, {\sc fntbl} does not succeed as well as the statistical approaches in the analysis of the unknown tokens. 

The maximum entropy learner, {\sc mxpost}, is most successful on average when training is performed on large data sets containing lexical information with or without PoS tags. {\sc mxpost} succeeds best among the three systems when the training data includes many different token types. 

The hidden Markov model based {\sc tnt} outperforms all systems when the size of the training set is small and includes lexical information with or without PoS information included in training. When a small training set is used, the percentage of unknown tokens is considerable, making classification more difficult.

The taggers described in this study were used with their default settings, including different types of background knowledge for the analysis of unknown words and the disambiguation of known words. Also, a pilot study was carried out training {\sc fntbl} with the same lexical and/or contextual parameters as {\sc mxpost} and {\sc tnt} use. Furthermore, {\sc tnt} was trained with the same lexical parameters as {\sc fntbl} and {\sc mxpost} use with and without smoothing. The preliminary results show that the parameters implemented in the systems, the algorithm bias, and the smoothing involved in learning all play an important role in the systems' performance. 

Future work includes a careful investigation of the algorithms applied to parsing when using the same parameter settings (context window size and number of characters) for each algorithm trained on various linguistic information. Additionally, since the algorithms were not optimized for Swedish, it would be necessary to investigate the best combination of features in the parameter settings of the systems. 

Future work also includes the improvement of the detection of maximal projections, and the provision of automatic data-driven detection of clause boundaries, such as relative clauses and other subordinate clauses for Swedish. In order to further improve parsing accuracy, the best combinations of the approaches could be determined by constructing ensembles of classifiers.

Lastly, since the method described in this paper can be assumed to be directly applicable to other languages as well, it would be very interesting to find out how well it would perform on various language types including languages with complex morphology and free word order.
 
\bigskip
{\bf Acknowledgments}

I am very grateful to my supervisor Rolf Carlson, the three anonymous reviewers for their valuable and helpful comments on the draft manuscript, and all the researchers who created the PoS taggers used in this study including Thorsten Brants, Radu Florian, Grace Ngai, and Adwait Ratnaparkhi. Also, I would like to thank Anette Hulth, Sara Rydin, Mattias Heldner, and Fredrik Olsson for discussions and comments, and Sheri Hunnicutt for proof-reading. Whatever errors remain are, of course, all mine. This research was supported by {\sc vinnova}, {\sc ctt}'s industrial partners, {\sc kth}, and {\sc hp} {\sc v}oice {\sc w}ebb {\sc i}nitiative. 
\bigskip
\newpage
\begin{center}{\bf\large Appendix}\end{center}
\begin{table}[H]
\setlength{\tabcolsep}{2pt}
\begin{center}
\begin{tabular}{l|l|l|r|r|r|r|r|r|r|r|r}
\hline
\multicolumn{1}{c|}{Feature} & \multicolumn{1}{c|}{Class} & \multicolumn{1}{c|}{Rate} & \multicolumn{1}{c|}{{\sc\small advp}} & \multicolumn{1}{c|}{{\sc\small ap}} & \multicolumn{1}{c|}{{\sc\small apmax}} & \multicolumn{1}{c|}{{\sc\small infp}} & \multicolumn{1}{c|}{{\sc\small np}} & \multicolumn{1}{c|}{{\sc\small npmax}} & \multicolumn{1}{c|}{{\sc\small nump}} & \multicolumn{1}{c|}{{\sc\small pp}} & \multicolumn{1}{c}{{\sc\small vc}} \\
& \multicolumn{1}{c|}{Total}  & & 5,970 & 10,477 & 1,433 & 2,541 & 53,810 & 24,350 & 1,951 & 29,419 & 16,282 \\
\noalign{\smallskip}
\hline
\hline
& & {\sc p} & 84.16 & 77.01 & 18.74 & 86.24 & 95.20 & 78.29 & 95.84 & 86.00 &93.31 \\
 & {\sc fntbl} & {\sc r} & 82.91 & 71.14 & 11.65 & 88.55 & 93.12 & 76.39 & 85.08 & 84.03 & 93.59 \\
 & & {\sc f-b1} & 83.53 & 73.96 & 14.37 & 87.38 & 94.15 & 77.33 & 90.14 & 85.00 & 93.45 \\
\cline{2-12}
 & & {\sc p} & 81.26 & 84.97 & 50.21 & 94.34 & 96.83 & 87.90 & 92.22 & 92.87 & 94.36 \\
W $\rightarrow$ PoS+Ph & {\sc mxpost} & {\sc r} & 81.16 & 72.55 & 16.47 & 93.23 & 94.49 & 72.34 & 86.93 & 84.36 & 97.90 \\
 & & {\sc f-b1} & 81.21 & 78.27 & 24.80 & 93.78 & 95.65 & 79.36 & 89.50 & 88.41 & 96.10 \\
\cline{2-12}
 & & {\sc p} & 83.41 & 82.98 & 41.72 & 88.09 & 97.04 & 73.64 & 93.95 & 82.95 & 95.97 \\
 & {\sc t}n{\sc t} & {\sc r} & 83.52 & 77.78 & 20.03 & 89.65 & 95.61 & 61.18 & 89.90 & 75.91 & 97.68 \\
 & & {\sc f-b1} & 83.46 & 80.29 & 27.07 & 88.86 & 96.32 & 66.83 & 91.88 & 79.27 & 96.82 \\
\hline
\hline 
 & & {\sc p} & 82.95 & 74.43 & 34.34 & 87.32  & 95.09 & 79.55 & 95.97 & 87.53 & 92.04 \\
 & {\sc fntbl} & {\sc r} & 82.95 & 74.26 & 16.68 & 85.40 & 93.32 & 69.89 & 86.57 & 78.95 & 95.04 \\
 & & {\sc f-b1} & 82.95 & 74.34 & 22.45 & 86.35 & 94.20 & 74.41 & 91.03 & 83.02 & 93.52 \\
\cline{2-12}
 & & {\sc p} & 84.41 & 86.17 & 61.22 & 96.61 & 95.88 & 88.78 & 94.44 & 93.88 & 94.16 \\
W $\rightarrow$ Ph & {\sc mxpost} & {\sc r} & 77.74 & 73.17 & 18.84 & 84.14 & 95.19 & 72.84 & 85.39 & 84.10 & 97.27 \\
 & & {\sc f-b1} & 80.93 & 79.14 & 28.81 & 89.94 & 95.53 & 80.02 & 89.69 & 88.72 & 95.69 \\
\cline{2-12}
 & & {\sc p} & 82.02 & 80.78 & 36.53 & 87.82 & 95.98 & 73.88 & 93.53 & 83.15 & 95.01 \\
 & {\sc t}n{\sc t} & {\sc r} & 81.12 & 77.75 & 22.61 & 86.86 & 95.04 & 60.15 & 89.60 & 73.96 & 96.90 \\
 & & {\sc f-b1} & 81.57 & 79.24 & 27.93 & 87.34 & 95.51 & 66.32 & 91.52 & 78.29 & 95.95 \\
\hline
\hline 
 & & {\sc p} & 99.70 & 85.15 & 48.45 & 94.82 & 97.77 & 84.98 & 96.84 & 90.79 & 99.12 \\
 & {\sc fntbl} & {\sc r} & 98.84 & 86.93 & 35.80 & 92.99 & 96.99 & 77.40 & 94.31 & 85.19 &  99.31 \\
 & & {\sc f-b1} & 99.27 & 86.03 & 41.18 & 93.90 & 97.38 & 81.01 & 95.56 & 87.90 &  99.21 \\
\cline{2-12}
 & & {\sc p} & 98.79 & 93.08 & 79.02 & 98.78 & 98.95 & 93.45 & 96.56 & 95.93 & 99.15 \\
W+PoS $\rightarrow$ Ph & {\sc mxpost} & {\sc r} & 98.14 & 85.45 & 33.64 & 95.79 & 97.57 & 77.25 & 92.11 & 86.43 & 99.74 \\
 & & {\sc f-b1} & 98.46 & 89.10 & 47.19 & 97.26 & 98.26 & 84.58 & 94.28 & 90.93 & 99.44 \\
\cline{2-12}
 & & {\sc p} & 99.23 & 90.59 & 56.62 & 91.61 & 98.45 & 79.44 & 98.25 & 87.14 & 99.30 \\
 & {\sc t}n{\sc t} & {\sc r} & 99.65 & 88.01 & 34.61 & 91.97 & 97.45 & 67.67 & 95.23 & 79.24 & 98.99 \\
 & & {\sc f-b1} & 99.44 & 89.28 &  42.96 &91.79 & 97.95 & 73.08 & 96.72 & 83.00 & 99.14 \\
\hline
\hline 
 & & {\sc p} & 78.75 & 95.18 & 96.09 & 100.00 & 99.39 & 97.84 & 98.65 & 98.59 & 100.00 \\
 & {\sc fntbl} & {\sc r} & 82.36 & 95.49 & 78.86 & 99.96 & 99.12 & 97.34 & 97.44 & 97.62 & 99.99 \\
 & & {\sc f-b1} & 80.51 & 95.33 & 86.63 & 99.98 & 99.25 & 97.59 & 98.04 & 98.10 & 99.99 \\
\cline{2-12}
 & & {\sc p} & 78.13 & 95.44 & 85.28 & 99.84 & 99.34 & 96.70 & 97.41 & 96.90 & 99.89 \\
PoS $\rightarrow$ Ph & {\sc mxpost} & {\sc r} & 77.19 & 88.57 & 67.13 & 98.07 & 98.12 & 79.47 & 96.41 & 89.89 & 99.99 \\
 & & {\sc f-b1} & 77.66 & 91.88 & 75.12 & 98.95 & 98.73 & 87.24 & 96.91 & 93.26 & 99.94 \\
\cline{2-12}
 & & {\sc p} & 67.75 & 94.06 & 79.49 & 96.87 & 99.12 & 95.55 & 98.71 & 96.78 & 100.00 \\
 & {\sc t}n{\sc t} & {\sc r} & 88.58 & 93.79 & 76.55 & 99.76 & 98.63 & 95.89 & 98.05 & 96.55 & 99.68 \\
 & & {\sc f-b1} & 76.78 & 93.92 & 77.99 & 98.29 & 98.87 & 95.72 & 98.38 & 96.66 & 99.84 \\
\hline
\end{tabular}
\caption{Precision {\sc (p)}, recall {\sc (r)} and F$_{\beta=1}$ {\sc (f-b1)} rates for each classifier when trained on 200k tokens on the four types of input features, and tested on 117,530 tokens.\label{PRF}}
\end{center}
\end{table}
\newpage

\begin{table}[H]
\setlength{\tabcolsep}{3.5pt}
\begin{center}
\begin{tabular}{l|l|l|rrr|rrr|rrr}
\hline
\multicolumn{1}{c|}{\sc feature} & \multicolumn{2}{c|}{\sc parameter} & \multicolumn{3}{c|}{\sc 2k} & \multicolumn{3}{c|}{\sc 20k} & \multicolumn{3}{c}{\sc 200k} \\
\hline
& \multicolumn{1}{c|}{\sc Lexical} & \multicolumn{1}{c|}{\sc Context} & \multicolumn{1}{c}{\sc t} &\multicolumn{1}{c}{\sc k} &\multicolumn{1}{c|}{\sc u} & \multicolumn{1}{c}{\sc t} & \multicolumn{1}{c}{\sc k} & \multicolumn{1}{c|}{\sc u} & \multicolumn{1}{c}{\sc t} & \multicolumn{1}{c}{\sc k} &\multicolumn{1}{c}{\sc u} \\
\noalign{\smallskip}
\hline
\hline
& {\sc fntbl} & original & 44.3 & 67.3 & 15.6 & 58.5 & 70.8 & 25.8 & 72.8 & 77.6 & 36.9 \\
& {\sc mxpost} & original & 40.7 & 56.2 & 21.3 & 62.9 & 70.5 & 42.5 & 77.9 & 80.1 & 61.2 \\
& {\sc t}n{\sc t} & original & 49.5 & 68.9 & 25.2 & 63.2 & 71.1 & 42.1 & 72.2 & 75.0 & 51.9 \\
\cline{2-12}
& {\sc fntbl} & {\sc mxpost} & 42.5 & 66.8 & 12.1 & 58.6 & 69.9 & 28.4 & 70.7 & 75.1 & 38.3 \\
W $\rightarrow$ PoS+Ph & {\sc mxpost} & {\sc fntbl} & 43.0 & 67.1 & 12.9 & 59.0 & 71.0 & 27.0 & 72.7 &  77.6 & 36.2  \\
& {\sc mxpost} & {\sc mxpost} & 43.2 & 66.6 & 14.0 & 58.1 & 70.0 & 26.1 & 70.9 & 75.3 & 38.1  \\
\cline{2-12}
& {\sc fntbl} &  {\sc t}n{\sc t} & 43.1 & 65.8 & 14.8 & 56.4 & 67.4 & 27.1 & 64.5 & 71.2 & 14.6 \\
&  {\sc t}n{\sc t} & {\sc fntbl} & 44.5 & 67.8 & 15.4 & 58.9 & 71.1 & 26.1 & 72.4 & 77.5 & 34.4  \\
& {\sc t}n{\sc t} &  {\sc t}n{\sc t} & 43.5 & 65.7 & 15.7 & 56.1 & 67.3 & 26.2 &64.7 & 71.6 & 12.9 \\
\hline
\hline
& {\sc fntbl} & original & 48.5 & 70.4 & 21.0 & 64.4 & 79.6 & 45.5 & 75.1 & 77.9 & 54.4 \\
& {\sc mxpost} & original & 50.6 & 63.9 & 33.9 & 72.5 & 77.2 & 59.8 & 81.7 & 83.0 & 72.0 \\
& {\sc t}n{\sc t} & original & 57.9 & 72.8 & 39.4 & 66.9 & 72.7 & 51.4 & 72.8 & 75.0 & 56.3 \\
\cline{2-12}
& {\sc fntbl} & {\sc mxpost} & 48.9 & 70.6 & 24.0 & 63.4 & 73.3 & 36.9 & 73.7 & 78.1 & 41.3 \\
W $\rightarrow$ Ph & {\sc mxpost} & {\sc fntbl} & 47.9 & 70.6 & 19.7 & 64.5 & 73.4 & 40.7 & 72.2 & 77.5 & 33.0 \\
& {\sc mxpost} & {\sc mxpost} & 47.8 & 57.8 & 20.8 & 64.0 & 73.4 & 38.9 & 73.1 & 77.9 & 37.3 \\
\cline{2-12}
& {\sc fntbl} &  {\sc t}n{\sc t} & 48.3 & 70.2 & 21.0 & 60.9 & 71.2 & 33.4 & 71.4 & 74.7 & 47.0 \\
&  {\sc t}n{\sc t} & {\sc fntbl} & 48.1 & 70.8 & 19.8 & 63.9 & 73.3 & 38.8 & 75.1 & 77.9 & 54.4 \\
& {\sc t}n{\sc t} &  {\sc t}n{\sc t} & 48.5 & 70.4 & 21.2 & 61.8 & 71.9 & 34.8 & 71.4 & 74.7 & 47.1 \\
\hline
\hline
& {\sc fntbl} & original & 60.8 & 74.8 & 44.8 & 76.2 & 79.3 & 68.5 & 83.3 & 83.5 & 82.4 \\
& {\sc mxpost} & original & 71.3 & 79.0 & 62.6 & 84.6 & 86.7 & 79.3 & 87.9 & 88.4 & 84.3 \\
& {\sc t}n{\sc t} & original & 73.1 & 79.7 & 65.5 & 78.5 & 78.9 & 77.5 & 79.9 & 80.0 & 79.5 \\
\cline{2-12}
& {\sc fntbl} & {\sc mxpost} & 63.4 & 77.5 & 47.3 & 74.5 & 79.1 & 63.1 & 81.8 & 82.6 & 76.7 \\
W+PoS $\rightarrow$ Ph & {\sc mxpost} & {\sc fntbl} & 60.5 & 74.8 & 44.2 & 75.5 & 79.1 & 66.3 & 83.2 & 83.6 & 80.0 \\
& {\sc mxpost} & {\sc mxpost} & 60.0 & 77.6 & 39.8 & 74.7 & 79.1 & 63.8 & 80.9 &82.6 & 69.8 \\
\cline{2-12}
& {\sc fntbl} &  {\sc t}n{\sc t} & 61.9 & 76.9 & 44.7 & 72.8 & 76.7 & 63.1 & 78.1 & 79.7 & 67.1 \\
&  {\sc t}n{\sc t} & {\sc fntbl} & 60.8 & 74.8 & 44.5 & 76.2 & 79.4 & 68.1 & 81.3 & 83.4 & 66.8 \\
& {\sc t}n{\sc t} &  {\sc t}n{\sc t} & 62.5 & 77.3 & 45.7 & 73.0 & 76.8 & 63.4 & 77.0 & 79.6 & 59.0 \\
\hline
\hline
& {\sc fntbl} & original & 78.6 & 78.9 & 49.7 & 89.4 & 89.5 & 15.5 & 94.8 & 94.8 & 70.4 \\
& {\sc mxpost} & original & 75.5 & 75.8 & 47.1 & 88.4 & 88.5 & 38.0 & 90.0 & 90.0 & 20.0 \\
& {\sc t}n{\sc t} & original & 76.6 & 76.8 & 50.2 & 88.1 & 88.2 & 33.8 & 92.0 & 92.1 & 40.0 \\
\cline{2-12}
& {\sc fntbl} & {\sc mxpost} & 76.6 & 76.9 & 47.3 & 87.4 & 87.5 & 22.5 & 91.7 & 91.7 & 00.0\\
PoS $\rightarrow$ Ph & {\sc mxpost} & {\sc fntbl} & 78.3 & 79.0 & 8.6 & 89.4 & 89.5 & 19.7 & 94.8 & 94.8 & 00.0 \\
& {\sc mxpost} & {\sc mxpost} & 76.6 & 76.9 & 47.3 & 87.4 & 87.5 & 0.0 & 91.7 & 91.7 & 00.0 \\
\cline{2-12}
& {\sc fntbl} &  {\sc t}n{\sc t} & 73.6 & 73.9 & 44.1 & 81.1 & 81.1 & 18.3 & 83.4 & 83.4  & 00.0 \\
&  {\sc t}n{\sc t} & {\sc fntbl} & 78.2 & 79.0 & 0.0 & 89.4 & 89.5 & 19.7 & 94.8 & 94.8 & 00.0 \\
& {\sc t}n{\sc t} &  {\sc t}n{\sc t} & 73.5 & 73.8 & 37.4 & 81.1 & 81.1 & 0.0 & 83.4 & 83.4 & 00.0 \\
\hline
\hline
\end{tabular}
\caption{The results are given for {\sc fntbl} using different lexical and contextual parameters of {\sc mxpost} and {\sc tnt}, trained on 2k, 20k and 200k tokens on the four types of input features, and tested on 117,530 tokens. Accuracy (\%) is calculated for the total number of tokens ({\sc t}), as well as for known ({\sc k}) and unknown ({\sc u}) tokens. \label{simul}}
\end{center}
\end{table}

\begin{thebibliography}{}
\bibitem[Abney, 1991]{Abney91} 
%(Abney, 1991)
Steven~Abney. 
Parsing by Chunks. 
In {\em Principle-Based Parsing}. Kluwer Academic Publishers, 1991.

\bibitem[Argamon \bgroup {\em et al.}, \egroup 1998]{Arg98}
%(Argamon {\em et al.}, 1998)
Shlomo~Argamon, Ido~Dagan and Yuval~Krymolowsky.
 {A Memory-Based Approach to Learning Shallow Natural Language Patterns.}
 In {\em Proceedings of 36th Annual Meeting of the Association for Computational Linguistics (ACL)}, pp. 67-73, Montreal, Canada, 1998.

\bibitem[Aycock, 1998]{Aycock98} 
%(Aycock, 1998)
John~Aycock. 
Compiling Little Languages in Python. 
In {\em Proceedings of the 7th International Python Conference}, 1998. 

\bibitem[Brants, 1999]{Brants99} 
%(Brants, 1999)
Thorsten~Brants.
{Cascaded Markov Models.}
 In {\em Proceedings of the 9th Conference of the European Chapter of the Association for Computational Linguistics (EACL-99)}, Bergen, Norway, 1999.

\bibitem[Brants, 2000]{Brants00} 
%(Brants, 2000)
Thorsten~Brants.
{TnT - A Statistical Part-of-Speech Tagger.}
 In {\em Proceedings of the 6th Applied Natural Language Processing Conference}, Seattle, Washington, USA, 2000.

\bibitem[Brill, 1993]{Brill93}
%(Brill, 1993)
Eric~Brill.
 {Automatic Grammar Induction and Parsing Free Text: a Transformation-Based Approach.}
 In {\em Meeting of the Association for Computational Linguistics (ACL)}, pp. 259-265, 1993.

\bibitem[Brill, 1994]{Brill94}
%(Brill, 1994)
Eric~Brill.
 {Some Advances in Rule-Based Part of Speech Tagging.}
 In {\em Proceedings of the 12th National Conference on Artificial Intelligence (AAAI-94)}, Seattle, Washington, 1994.

\bibitem[Buchholz \bgroup {\em et al.}, \egroup 1999]{Buch99}
%(Buchholz {\em et al.}, 1999)
Sabine~Buchholz, Jorn~Veenstra and Walter~Daelemans.
 {Cascaded Grammatical Relation Assignment.}
 In {\em Proceedings of the 1999 Joint SIGDAT Conference on Empirical Methods in Natural Language Processing and Very Large Corpora}, 1999.

\bibitem[Cardie and Pierce, 1998]{Cardie98}
%(Cardie \& Pierce, 1998)
Claire~Cardie and David~Pierce.
{Error-Driven Pruning of Treebank Grammars for Base Noun Phrase Identification.}
In {\em Proceedings of COLING/ACL}, pp. 218-224, Montreal, Canada, 1998.

\bibitem[Church, 1988]{Church88}
%(Church, 1988)
Kenneth Church.
{A Stochastic Parts Program and Noun Phrase Parser for Unrestricted Texts.}
In {\em Proceedings of the Second Conference on Applied Natural Language Processing}, pp. 136-143. Association for Computational Linguistics, 1988.

\bibitem[Cussens, 1998]{Cussens98}
%(Cussens, 1998)
James Cussens.
{Notes on Inductive Logic Programming Methods in Natural Language Processing} (European Work). Manuscript (http://www.cs.york.ac.uk/mlg/), 1998.

\bibitem[Daelemans \bgroup {\em et al.}, \egroup 1996]{Dael96}
%(Daelemans {\em et al.}, 1996)
Walter~Daelemans, Jakub~Zavrel, Peter~Berck and Steven E.~Gillis.
{MBT: a Memory-Based Part of Speech Tagger-Generator.}
In {\em Proceedings of Fourth Workshop on Very Large Corpora (VLC-96)}, pp. 14-27, Copenhagen, Denmark, 1996.

\bibitem[Daelemans \bgroup {\em et al.}, \egroup 1999]{Dael99}
%(Daelemans {\em et al.}, 1999)
Walter~Daelemans, Antal~van den Bosch and Jakub~Zavrel.
{Forgetting exceptions is harmful in language learning.}
In {\em Machine Learning}, 34, 1999.

\bibitem[De Pauw and Daelemans, 2000]{dePauw00}
%(De Pauw and Daelemans, 2000)
Guy~De Pauw and Walter~Daelemans.
 {The Role of Algorithm Bias vs Information Source in Learning Algorithms for Morphosyntactic Disambiguation.}
 In {\em Proceedings of Computational Natural Language Learning (CoNLL-00)}, pp. 19-24, Lisbon, Portugal, 2000.

\bibitem[Eineborg and Lindberg, 2000]{Eineborg00}
%(Eineborg and Lindberg, 2000)
Martin~Eineborg and Nikolaj~Lindberg.
 {ILP in part-of-speech tagging - An overview.}
 In Cussens, J and Dzeroski, S (editors), {\em Learning Language in Logic Workshop (LLL99)}, Bled, Slovenia. 2000.

\bibitem[Ejerhed {\em et al.}, 1992]{Eje92}
%(Ejerhed {\em et al.}, 1992)
Eva~Ejerhed, Gunnel~K\"{a}llgren, Ola~Wennstedt and Magnus~\AA str\"{o}m.
 {\em The Linguistic Annotation System of the Stockholm-Ume{\aa} Project.} 
 {Dept. of General Linguistics, University of Ume\aa}, 1992.

\bibitem[Gee and Grosjean, 1983]{Gee83}
%(Gee and Grosjean, 1983)
James Paul Gee and Fran\c{c}ois Grosjean.
 Performance Structures: A psycholinguistic and linguistic appraisal. 
 In {\em Cognitive Psychology}, 15, pp. 411-458, 1983.

\bibitem[Marcus {\em et al.}, 1994]{Marcus94}
%(Marcus {\em et al.}, 1994)
Mitchell~Marcus, Grace~Kim, Mary Ann~Marcinkiewicz, Robert~MacIntyre, Ann~Bies, Mark~Ferguson, Karen~Katz and Britta~Schasberger.
 The Penn Treebank: A Revised Corpus Design for Extracting Predicate Argument Structure.
 In {\em Human Language Technology, ARPA March 1994 Workshop}, Morgan Kaufmann, 1994.

\bibitem[Megyesi, 2001]{Megyesi01}
%(Megyesi, 2001)
Be\'{a}ta~Megyesi.
 Comparing Data-Driven Learning Algorithms for PoS Tagging of Swedish.
 In {\em Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2001).} pp. 151-158, Carnegie Mellon University, Pittsburgh, PA, USA, June, 2001.

\bibitem[Megyesi, 2001a]{Megyesi011}
%(Megyesi, 2001a)
Be\'{a}ta~Megyesi.
 Phrasal Parsing by Using Data-Driven PoS Taggers.
 In {\em Proceedings of Recent Advances in Natural Language Processing (EuroConference RANLP-2001)}, Tzigov Chark, Bulgaria, September, 2001.

\bibitem[Ngai and Florian, 2001]{Ngai01}
%(Ngai and Florian, 2001)
Grace Ngai and Radu Florian.
Transformation-Based Learning in the Fast Lane.
In {\em Proceedings of North American Chapter of the Association for Computational Linguistics (NAACL-2001)}, pp. 40--47, June, 2001.

\bibitem[Osborne, 2000]{Osborne00}
%(Osborne, 2000)
Miles~Osborne.
 Shallow Parsing as Part-of-Speech Tagging.
 In {\em Proceedings of CoNLL-2000 and LLL-2000}, pp. 145-147, Lisbon, Portugal, 2000.

\bibitem[Ramshaw and Marcus, 1995]{Ramshaw95}
%(Ramshaw and Marcus, 1995)
Lance A.~Ramshaw and Mitchell P.~Marcus.
 Text Chunking Using Transformation-Based Learning.
 In {\em Proceedings of the Third ACL Workshop on Very Large Corpora}, Association for Computational Linguistics, 1995.

\bibitem[Ratnaparkhi, 1996]{Ratna96} 
%(Ratnaparkhi, 1996)
Adwait~Ratnaparkhi. 
A Maximum Entropy Model for Part-of-Speech Tagging. In {\em Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP-96)}, Philadelphia, PA, USA, 1996.

\bibitem[Skut and Brants, 1998]{Brants98} 
%(Skut and Brants, 1998)
Wojciech Skut and Thorsten~Brants.
{Chunk Tagger Statistical Recognition of Noun Phrases.}
 In {\em ESSLLI-98 Workshop on Automated Acquisition of Syntax and Parsing (ESSLLI-98)}, Saarbr\"{u}cken, Germany, 1998.

\bibitem[Tjong Kim Sang \& Buchholz, 2000]{Tjong00}
%(Tjong Kim Sang \& Buchholz, 2000)
Erik~Tjong Kim Sang and Sabine~Buchholz.
 Introduction to the CoNLL-2000 Shared Task: Chunking.
 In {\em Proceedings of CoNLL and LLL-2000}, pp. 127-132, Lisbon, Portugal, 2000.

\bibitem[van Halteren, 1999]{Halt99}
%(van Halteren, 1999)
Hans~van Halteren (editor).
 {\em Syntactic Wordclass Tagging}.
 {Kluwer Academic Publishers, Dordrecht, The Netherlands, 1999.}

\bibitem[Veenstra, 1999]{Veen99}
%(Veenstra, 1999)
Jorn~Veenstra.
 Memory-Based Text Chunking.
 In {\em Workshop on Machine Learning in Human Language Technology}, ACAI-99, Crete, Greece, 1999.

\bibitem[Zavrel and Daelemans, 1999]{Zav99}
%(Zavrel and Daelemans, 1999)
Jakub~Zavrel and Walter~Daelemans.
{Recent Advances in Memory-Based Part-of-Speech Tagging.}
In {\em Proceedings of the VI Simposio Internacional de Comunicacion Social}, pp. 590-597, Santiago de Cuba, 1999.

\end{thebibliography}

\end{document}