import pandas as pd
from conll_df import conll_df
# Location of the CONLL-U file to load.
path ='en-ud-train.txt.conllu'
# Parse the file into `df`; file_index=False keeps the filename out of the
# index levels (see the "file_index" entry in the argument table below).
df = conll_df(path, file_index=False)
# Render the first 40 rows as HTML. The result is not stored, so this only
# shows something in an interactive session/notebook.
df.head(40).to_html()
Output (truncated):
w
l
x
p
g
f
e
type
gender
Case
Definite
Degree
Foreign
Gender
Mood
Number
Person
Poss
Reflex
Tense
Voice
Type
s
i
1
1.0
Al
Al
PROPN
NNP
0
root
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
2.0
-
-
PUNCT
HYPH
1
punct
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
3.0
Zaman
Zaman
PROPN
NNP
1
flat
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
4.0
:
:
PUNCT
:
1
punct
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
5.0
American
american
ADJ
JJ
6
amod
_
_
_
_
_
Pos
_
_
_
_
_
_
_
_
_
_
6.0
forces
force
NOUN
NNS
7
nsubj
_
_
_
_
_
_
_
_
_
Plur
_
_
_
_
_
_
7.0
killed
kill
VERB
VBD
1
parataxis
_
_
_
_
_
_
_
_
Ind
_
_
_
_
Past
_
_
8.0
Shaikh
Shaikh
PROPN
NNP
7
obj
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
9.0
Abdullah
Abdullah
PROPN
NNP
8
flat
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
10.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2
1.0
[
[
PUNCT
-LRB-
10
punct
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
2.0
This
this
DET
DT
3
det
_
Dem
_
_
_
_
_
_
_
Sing
_
_
_
_
_
Dem
3.0
killing
killing
NOUN
NN
10
nsubj
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
4.0
of
of
ADP
IN
7
case
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
5.0
a
a
DET
DT
7
det
_
Art
_
_
Ind
_
_
_
_
_
_
_
_
_
_
Art
6.0
respected
respected
ADJ
JJ
7
amod
_
_
_
_
_
Pos
_
_
_
_
_
_
_
_
_
_
7.0
cleric
cleric
NOUN
NN
3
nmod
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
8.0
will
will
AUX
MD
10
aux
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
9.0
be
be
AUX
VB
10
aux
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
10.0
causing
cause
VERB
VBG
0
root
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
_
Function arguments
Name
Type
Description
path
str
Path to CONLL-U file
add_gov
bool
Create extra columns for governor word, lemma, POS and function
skip_morph
bool
Enable if you'd like to skip the parsing of morphological and extra fields
v2
bool/'auto'
CONLL-U version of file. By default, detect from data
drop
list
list of column names you don't need
add_meta
bool
add columns for sentence-level metadata
categories
bool
Convert columns to categorical format where possible
file_index
bool
Include filename in index levels
extra_fields
list/'auto'
Names of extra fields in the last column. By default, detect from data
kwargs
dict
additional arguments to pass to pandas.read_csv()
Configuring these arguments can increase speed a lot, so if speed is important to you, turn off the features you don't need.
Where to from here?
If you're working with Python and CONLL-U, you might want to take a look at tücan, which provides a command-line and web-app interface for exploring CONLL-U datasets.
Alternatively, there's plenty of cool stuff you can do with Pandas by itself. Here are some toy examples:
def searcher(df, column, query, inverse=False):
    """Search column for regex query.

    Args:
        df: DataFrame to filter.
        column: Name of the column whose string values are tested.
        query: Regular expression handed to ``Series.str.contains``; it may
            match anywhere in the value unless the pattern is anchored.
        inverse: If true, return the rows that do NOT match instead.

    Returns:
        The rows of ``df`` selected by the (possibly inverted) match mask.
    """
    # na=False: a missing value counts as "no match", which keeps the mask
    # strictly boolean so it can be used to index the frame.
    matched = df[column].str.contains(query, na=False)
    mask = ~matched if inverse else matched
    return df[mask]
# Attach searcher to DataFrame so that searches can be chained as methods.
pd.DataFrame.search = searcher
# get nominal subjects starting with a, b or c
# NOTE: the query is an unanchored regex, so 'nsubj' also selects
# 'nsubj:pass' rows (visible in the output below).
df.search('f', 'nsubj').search('w', '^[abc]').head().to_html()
w
l
x
p
g
f
e
type
gender
Case
Definite
Degree
Foreign
Gender
Mood
Number
Person
Poss
Reflex
Tense
Voice
Type
s
i
3
4.0
authorities
authority
NOUN
NNS
5
nsubj
_
_
_
_
_
_
_
_
_
Plur
_
_
_
_
_
_
8
2.0
cells
cell
NOUN
NNS
4
nsubj
_
_
_
_
_
_
_
_
_
Plur
_
_
_
_
_
_
9
3.0
announcement
announcement
NOUN
NN
6
nsubj:pass
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
12
3.0
commander
commander
NOUN
NN
7
nsubj
_
_
_
_
_
_
_
_
_
Sing
_
_
_
_
_
_
9.0
bombings
bombing
NOUN
NNS
11
nsubj
_
_
_
_
_
_
_
_
_
Plur
_
_
_
_
_
_
Create a concordancer
def_conclines(match, df=False, column=False):
"""Apply this to each sentence"""
s, i = match.name
sent = df['w'].loc[s]
match['left'] = sent.loc[:i-1].str.cat(sep='')
match['right'] = sent.loc[i+1:].str.cat(sep='')
formatted = match['w']
if column !='w':
formatted +='/'+ match[column]
match['match'] = formatted
return match
def conc(df, column, query):
    """Build simple concordancer.

    Args:
        df: Token frame indexed by ``(sentence, token)`` with a ``'w'``
            (word) column.
        column: Name of the column to search.
        query: Regular expression handed to ``Series.str.contains``.

    Returns:
        A DataFrame with the columns ``'left'``, ``'match'`` and ``'right'``,
        one row per matching token. It is empty (but still has those three
        columns) when nothing matches.
    """
    wanted = ['left', 'match', 'right']
    # get query matches; na=False treats missing values as non-matching so
    # the mask stays boolean
    hits = df[df[column].str.contains(query, na=False)]
    if hits.empty:
        # DataFrame.apply on an empty frame does not create the context
        # columns, so selecting them below would raise KeyError.
        return pd.DataFrame(columns=wanted, index=hits.index)
    # add left and right columns
    lines = hits.apply(_conclines, df=df, column=column, axis=1)
    return lines[wanted]
# Attach conc to DataFrame so it can be called as a method.
pd.DataFrame.conc = conc
# Concordance over the first 1000 rows for lemmas matching 'be'.
# NOTE: the query goes to str.contains as an unanchored regex, so it also
# matches lemmas that merely contain "be"; use '^be$' for the verb alone.
lines = df.head(1000).conc('l', 'be')
# Render the first 10 concordance lines as HTML.
lines.head(10).to_html()
The Tidelift Subscription provides access to a continuously curated stream of human-researched and maintainer-verified data on open source packages and their licenses, releases, vulnerabilities, and development practices.