# Setup: First download the corpus from here, unzip it, and put
# this source file in the same directory as suppes.parsed
#
# https://www.socsci.uci.edu/~lpearl/CoLaLab/CHILDESTreebank/CHILDESTreebank-curr.zip
#
# For more info, see:
#
# https://www.socsci.uci.edu/~lpearl/CoLaLab/CHILDESTreebank/childestreebank.html

import random
from string import punctuation


# Trees

def tree(label, branches=()):
    """Construct a tree: a list whose first item is the label and whose
    remaining items are the branches (each itself a tree).

    `branches` may be any iterable of trees; it is materialized once so that
    one-shot iterators survive the validation pass.
    """
    branches = list(branches)
    for branch in branches:
        assert is_tree(branch), 'branches must be trees'
    return [label] + branches


def label(tree):
    """Return the label (first item) of a tree."""
    return tree[0]


def branches(tree):
    """Return the list of branches (everything after the label) of a tree."""
    return tree[1:]


def is_tree(tree):
    """Return whether `tree` is a non-empty list whose branches are all trees."""
    if not isinstance(tree, list) or len(tree) < 1:
        return False
    return all(is_tree(branch) for branch in branches(tree))


def is_leaf(tree):
    """Return whether a tree has no branches."""
    return not branches(tree)


def print_tree(t, indent=0):
    """Print a representation of this tree in which each label is
    indented by two spaces times its depth from the root.
    """
    print('  ' * indent + str(label(t)))
    for b in branches(t):
        print_tree(b, indent + 1)


def leaves(t):
    """Return a list of the leaves of a tree, in left-to-right order."""
    if is_leaf(t):
        return [t]
    result = []
    for b in branches(t):
        # extend() avoids the quadratic copying of sum(list_of_lists, []).
        result.extend(leaves(b))
    return result


# Syntax

treebank_examples = """
(ROOT (SBARQ (WHNP (WP what))
    (SQ (VP (AUX is)) (NP (DT the) (NN rabbit)) (VP (VBG doing)))
    (. ?)))

(ROOT (SQ (AUX is) (NP (PRP he)) (VP (VBG hopping)) (. ?)))
""".split('\n')


def phrase(tag, branches):
    """Construct a phrase: a tree labeled with `tag` that has `branches`."""
    return tree(tag, branches)


def word(tag, text):
    """Construct a word: a leaf whose label is the pair [tag, text]."""
    return tree([tag, text])


def tag(t):
    """Return the tag of a phrase or word."""
    if is_leaf(t):
        # A word's label is [tag, text]; a phrase's label is the tag itself.
        return label(t)[0]
    return label(t)


def text(word):
    """Return the text of a word (the second item of its label)."""
    return label(word)[1]


def read_sentences(input):
    """Return parsed sentences as lists of tokens for a list of lines.

    A sentence may span several lines; it ends on the line where its
    parentheses balance. Blank lines are ignored.

    >>> for s in read_sentences(treebank_examples):
    ...     print(' '.join(s[:20]), '...')
    ( ROOT ( SBARQ ( WHNP ( WP what ) ) ( SQ ( VP ( AUX is ) ) ...
    ( ROOT ( SQ ( AUX is ) ( NP ( PRP he ) ) ( VP ( VBG hopping ...
    """
    sentences = []
    tokens = []
    for line in input:
        if line.strip():
            # Pad parentheses with spaces so split() isolates them as tokens.
            tokens.extend(line.replace('(', ' ( ').replace(')', ' ) ').split())
            if tokens.count('(') == tokens.count(')'):
                sentences.append(tokens)
                tokens = []
    return sentences


def all_sentences():
    """Return the token lists for every sentence in 'suppes.parsed'.

    The file is looked up relative to the current working directory.
    """
    with open('suppes.parsed') as corpus:
        return read_sentences(corpus)


def tokens_to_parse_tree(tokens):
    """Return a tree for a list of tokens representing a parsed sentence.

    >>> print_tree(tokens_to_parse_tree(read_sentences(treebank_examples)[0]))
    ROOT
      SBARQ
        WHNP
          ['WP', 'what']
        SQ
          VP
            ['AUX', 'is']
          NP
            ['DT', 'the']
            ['NN', 'rabbit']
          VP
            ['VBG', 'doing']
        ['.', '?']
    """
    assert tokens[0] == '(', tokens
    t, end = read_parse_tree(tokens, 1)
    return t


def is_valid_tree(t):
    """Return a truthy value if `t` is a non-empty tree with a non-empty tag."""
    return t and tag(t)


def read_parse_tree(tokens, i):
    """Return a tree for the next constituent of a token list and the end index.

    `i` is the index of the constituent's tag (just past its opening
    parenthesis). The returned index is just past the constituent's closing
    parenthesis. The returned tree is None when a phrase has no valid branches.
    """
    # Named constituent_tag so the module-level tag() function is not shadowed.
    constituent_tag = tokens[i]
    i += 1
    if tokens[i] != '(':
        # A word: "( TAG text )".
        assert tokens[i + 1] == ')'
        return word(constituent_tag, tokens[i]), i + 2
    children = []
    while tokens[i] != ')':
        assert tokens[i] == '('
        branch, i = read_parse_tree(tokens, i + 1)
        if is_valid_tree(branch):
            children.append(branch)
    if children:
        return phrase(constituent_tag, children), i + 1
    return None, i + 1


def print_parse_tree(t):
    """Return the parse tree as a string in its original treebank format.

    >>> print_parse_tree(tokens_to_parse_tree(read_sentences(treebank_examples)[0]))
    '(ROOT (SBARQ (WHNP (WP what)) (SQ (VP (AUX is)) (NP (DT the) (NN rabbit)) (VP (VBG doing))) (. ?)))'
    """
    if is_leaf(t):
        return '(' + tag(t) + ' ' + text(t) + ')'
    inner = ''.join(' ' + print_parse_tree(b) for b in branches(t))
    return '(' + tag(t) + inner + ')'


def words(t):
    """Return the words of a tree as a string.

    Punctuation (except '$') and the clitics n't, 's, 're, 've are attached
    to the preceding word without a space.

    >>> words(tokens_to_parse_tree(read_sentences(treebank_examples)[0]))
    'what is the rabbit doing?'
    """
    s = ''
    for leaf in leaves(t):
        w = text(leaf)
        attaches = ((w in punctuation and w not in '$')
                    or w in ("n't", "'s", "'re", "'ve"))
        if not s or attaches:
            s = s + w
        else:
            s = s + ' ' + w
    return s


# Sentence Generator

def nodes(t):
    """List all [tag, node] pairs of a parse tree in pre-order."""
    result = []

    def traverse(t):
        result.append([tag(t), t])
        for b in branches(t):
            traverse(b)

    traverse(t)
    return result


def index_trees(trees):
    """Return a dictionary from tags to lists of the nodes carrying that tag."""
    index = {}
    for t in trees:
        # node_tag (not tag) so the module-level tag() function stays visible.
        for node_tag, node in nodes(t):
            index.setdefault(node_tag, []).append(node)
    return index


def coin(prob):
    """Return a zero-argument function that returns True with probability `prob`."""
    def flip():
        """Return True if a coin flip comes up heads."""
        return random.random() < prob
    return flip


def gen_tree(t, tree_index, flip):
    """Return a version of t in which constituents are randomly replaced.

    For each branch, `flip()` decides whether to swap it for a random node
    with the same tag drawn from `tree_index`; the (possibly swapped) branch
    is then processed recursively. Leaves are returned unchanged.
    """
    if is_leaf(t):
        return t
    new_branches = []
    for b in branches(t):
        if flip():
            b = random.choice(tree_index[tag(b)])
        new_branches.append(gen_tree(b, tree_index, flip))
    return phrase(tag(t), new_branches)


def generate(gen=gen_tree):
    """Interactively print original corpus sentences and generated variants.

    Only sentences with more than 100 tokens are used. Loops forever,
    waiting for the user to press Enter between outputs.
    """
    trees = [tokens_to_parse_tree(s) for s in all_sentences() if len(s) > 100]
    tree_index = index_trees(trees)
    while True:
        original = random.choice(trees)
        print('Original: ', words(original).lower())
        edited = gen(original, tree_index, coin(0.3))
        input()
        print('Generated:', words(edited).lower())
        input()


# The string below is an unexecuted sketch of an alternative data
# representation (flat token lists instead of nested lists).
'''
Change of data representation:

def phrase(label, branches):
    return ['(', tag] + sum(branches, []) + [')']

def word(tag, text):
    return ['(', tag, text, ')']

def tag(t):
    return t[1]

def text(word):
    return word[2]

def branches(tree):
    if is_leaf(tree):
        return []
    branch = []
    branches = []
    assert tree[0] == '(' and tree[2] == '(', tree
    opened = 1
    for token in tree[2:]:
        branch.append(token)
        if token == '(':
            opened += 1
        if token == ')':
            opened -= 1
            if opened == 1:
                branches.append(branch)
                branch = []
    assert opened == 0, tree
    return branches

def is_leaf(tree):
    return len(tree) == 4
'''