Basic-level emergence

Report 0 Downloads 66 Views
Basic-level emergence Import libraries library(tidyverse) ## ## ## ## ## ##

Loading Loading Loading Loading Loading Loading

tidyverse: tidyverse: tidyverse: tidyverse: tidyverse: tidyverse:

ggplot2 tibble tidyr readr purrr dplyr

## Conflicts with tidy packages ---------------------------------------------## filter(): dplyr, stats ## lag(): dplyr, stats library(ggthemes)

Import data raw_clicks = read_delim('../../data/categorical/artificial/clickedObj/clickedObjData.csv', '\t') ## Parsed with column specification: ## cols( ## .default = col_integer(), ## iterationName = col_character(), ## gameid = col_character(), ## time = col_double(), ## object1name = col_character(), ## object2name = col_character(), ## object3name = col_character(), ## object4name = col_character(), ## intendedName = col_character(), ## clickedName = col_character(), ## correct = col_character(), ## condition = col_character(), ## contextType = col_character(), ## eventType = col_character() ## ) ## See spec(...) for full column specifications. raw_drops = read_delim('../../data/categorical/artificial/drop/dropData.csv', '\t') ## Parsed with column specification: ## cols( ## iterationName = col_character(), ## gameid = col_character(), ## time = col_double(), ## intendedName = col_character(), 1

## ## ## ## ## )

trialNum = col_integer(), text = col_character(), timeFromRoundStart = col_integer(), eventType = col_character()

incompletes % group_by(gameid, condition) %>% tally() %>% filter(n < 60))$gameid Filter out incompletes & compute cumulative accuracy. We also divide into quarters to compare games that ran different amounts of trials. d % filter(!(gameid %in% incompletes)) %>% mutate(acc = ifelse(correct == 'true', 1, 0)) %>% group_by(gameid) %>% mutate(condition = case_when(condition == 'over' ~ 'sub-majority', condition == 'under' ~ 'super-majority', condition == 'basic' ~ 'basic-majority', condition == 'uniform' ~ 'uniform')) %>% mutate(numRounds = last(trialNum)) %>% mutate(quarter = floor((trialNum - 1) / (last(trialNum)/4))) %>% mutate(cumAcc = cumsum(acc)) %>% mutate(overallAcc = last(cumAcc)/last(trialNum)) %>% ungroup() %>% left_join(raw_drops, by = c('gameid', 'trialNum', 'intendedName')) %>% select(-ends_with('y'), -ends_with('x'), -starts_with('object'), -correct)

Number games per condition d %>% group_by(gameid, condition) %>% tally() %>% group_by(condition) %>% summarize(n = length(n)) ## ## ## ## ## ## ##

# A tibble: 4 x 2 condition n 1 basic-majority 5 2 sub-majority 6 3 super-majority 3 4 uniform 11

Results Individual accuracy curves

2

ggplot(d, aes(x = trialNum, y = cumAcc, group = gameid)) + geom_line() + theme_few() + guides(color = FALSE) + ylab("cumulative accuracy")

cumulative accuracy

80

60

40

20

0 0

25

50

75

trialNum Accuracy by condition d %>% group_by(condition, trialNum) %>% summarize(meanAcc = mean(cumAcc), se = sd(cumAcc)/sqrt(length(cumAcc))) %>% ggplot(aes(x = trialNum, y = meanAcc, color = condition)) + geom_line() + # geom_errorbar(aes(ymin = meanAcc - se, ymax = meanAcc + se)) + theme_few() + scale_color_colorblind()

3

40

meanAcc

condition basic−majority sub−majority super−majority uniform

20

0 0

25

50

75

trialNum Accuracy by contextType d %>% group_by(contextType, quarter) %>% summarize(meanAcc = mean(acc), se = sd(acc)/sqrt(length(acc))) %>% ggplot(aes(x = quarter, y = meanAcc)) + geom_line() + theme_few() + facet_wrap(~ contextType)

4

basic

sub

super

meanAcc

0.8

0.7

0.6

0.5 0

1

2

3

0

1

2

3

0

1

2

3

quarter One hypothesis is that in conditions where you don’t see very many sub contexts, performance on those contexts should stay really bad or go down as you specialize for the other trials (especially in the ‘under’ condition). In the uniform condition, though, you should see it still going up because people need all the words. d %>% group_by(condition, contextType, quarter) %>% summarize(meanAcc = mean(acc), se = sd(acc)/sqrt(length(acc))) %>% ggplot(aes(x = quarter, y = meanAcc)) + geom_line() + theme_few() + facet_grid(condition ~ contextType) + theme(aspect.ratio = 1)

5

basic

sub

super basic−majority sub−majority super−majority

1.0 0.8 0.6 0.4 1.0 0.8

meanAcc

0.6 0.4 1.0 0.8 0.6 0.4 1.0

uniform

0.8 0.6 0.4 0

1

2

3 0

1

2

3 0

1

2

3

quarter

Post-test results Import postTest % mutate_each(funs(ifelse(. == "true", 1, 0)), -iterationName, -gameid, -time, -label, -finalRole, -eventType) %>% gather(object, meaning, blueSquare1:stripedCircle2) %>% mutate(blue = grepl('blue', object), red = grepl('red', object), striped = grepl('striped', object), spotted = grepl('spotted', object), circle = grepl("Circle", object), square = grepl("Square", object)) ## Parsed with column specification: ## cols( ## iterationName = col_character(), ## gameid = col_character(), ## time = col_double(), ## blueSquare1 = col_character(), ## blueSquare2 = col_character(), ## redSquare1 = col_character(), ## redSquare2 = col_character(), ## spottedCircle1 = col_character(), ## spottedCircle2 = col_character(),

6

## ## ## ## ## ## )

stripedCircle1 = col_character(), stripedCircle2 = col_character(), label = col_character(), finalRole = col_character(), eventType = col_character()

## `mutate_each()` is deprecated. ## Use `mutate_all()`, `mutate_at()` or `mutate_if()` instead. ## To map `funs` over a selection of variables, use `mutate_at()`

‘Validate’ meanings against trajectory data How do these post-test ratings compare to ‘situated’ language use? Do players actually use the label during the game to mean the thing they explicitly say it means at the end?

Distribution of meanings How many objects does each label correspond to (i.e. how many meanings at sub-level vs. basic-level vs. super-level) postTest %>% group_by(gameid, finalRole, label) %>% summarize(numObjects = sum(meaning)) %>% left_join(d) %>% ggplot(aes(x = numObjects, y = ..density..)) + geom_histogram() + facet_wrap(~ condition) + theme_few() ## Joining, by = "gameid" ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7

basic−majority

sub−majority

super−majority

uniform

4 3 2

density

1 0 4 3 2 1 0 0

1

2

3

4

0

1

numObjects How often do players align on meanings? On average, pairs only match on about 30% of the meanings they mark. . . postTest %>% select(-time) %>% spread(finalRole, meaning) %>% filter(listener > 0 | speaker > 0) %>% group_by(gameid, label) %>% summarize(match = all(listener == speaker)) %>% group_by(gameid) %>% summarize(numMatching = sum(match) / length(match)) %>% left_join(d) %>% ggplot(aes(x = numMatching, y = ..density..)) + geom_histogram(binwidth = .2) + geom_vline(aes(xintercept = mean(numMatching))) + xlim(0,1) + theme_few() + xlab('% matching') ## Joining, by = "gameid"

8

2

3

4

density

1.5

1.0

0.5

0.0 0.00

0.25

0.50

% matching # facet_wrap(~ condition) Unsurprisingly, pairs that aligned on meanings better performed better. . . postTest %>% select(-time) %>% spread(finalRole, meaning) %>% filter(listener > 0 | speaker > 0) %>% group_by(gameid, label) %>% summarize(match = all(listener == speaker)) %>% group_by(gameid) %>% summarize(numMatching = sum(match) / 16) %>% left_join(d) %>% group_by(gameid) %>% summarize(numMatching = mean(numMatching), overallAcc = mean(overallAcc)) %>% ggplot(aes(x = numMatching, y = overallAcc)) + geom_point() + theme_few() + geom_smooth(method = 'lm') ## Joining, by = "gameid"

9

0.75

1.00

1.0

overallAcc

0.8

0.6

0.4

0.2 0.0

0.2

0.4

numMatching But note that pairs that didn’t technically align that well could still perform pretty well if one partner simply has a stricter meaning than the other but the difference is never relevant. When people fail to perfectly align, do they do so in a predictable way? (e.g. one meaning a subset of the other?)

Vocab size by condition. . . postTest %>% group_by(gameid, finalRole, label) %>% summarize(numObjects = sum(meaning)) %>% filter(numObjects > 0) %>% group_by(gameid, finalRole) %>% tally() %>% right_join(d) %>% group_by(condition) %>% summarize(vocabSize = mean(n, na.rm = T)) ## Joining, by = "gameid" ## ## ## ## ## ## ##

# A tibble: 4 x 2 condition vocabSize 1 basic-majority 8.200000 2 sub-majority 9.083333 3 super-majority 7.500000 4 uniform 8.932203

10

Do basic-level & sub-level coexist? postTest %>% group_by(gameid, finalRole, label) %>% filter(meaning == 1) %>% summarize(subordinate = sum(meaning) == 1, basic = (sum(meaning) == 2 & (all(red) | all(blue) | all(striped) | all(spotted)))) %>% group_by(gameid, finalRole) %>% summarize(numSub = sum(subordinate), numBasic = sum(basic)) %>% left_join(d) %>% filter(condition %in% c('uniform', 'sub-majority', 'basic-majority')) %>% group_by(gameid, finalRole, condition) %>% summarize(numSub = mean(numSub), numBasic=mean(numBasic)) %>% ggplot(aes(x = numSub, y = numBasic)) +#, color = numSub > 0 & numBasic > 0)) + geom_jitter(width = .2, height = .2, size = 3) + facet_grid(~ condition) + theme_few() + xlab("# subordinate meanings") + ylab("# basic meanings") + theme(aspect.ratio=1) ## Joining, by = "gameid"

# basic meanings

basic−majority

sub−majority

uniform

6

4

2

0 0

5

10

15

0

5

10

15

# subordinate meanings #guides(color=FALSE) ggsave("../../writing/evolang18/result.png") ## Saving 6.5 x 4.5 in image # postTest %>% # group_by(gameid, finalRole, label) %>% # summarize(subordinate = sum(meaning) == 1, # basic = Next, can just look at means. . .

11

0

5

10

15

postTest %>% group_by(gameid, finalRole, label) %>% filter(meaning == 1) %>% summarize(subordinate = sum(meaning) == 1, basic = (sum(meaning) == 2 & (all(red) | all(blue) | all(striped) | all(spotted)))) %>% group_by(gameid, finalRole) %>% summarize(numSub = sum(subordinate), numBasic = sum(basic)) %>% left_join(d) %>% filter(condition %in% c('uniform', 'sub-majority', 'basic-majority')) %>% filter(overallAcc > .5) %>% group_by(gameid, finalRole, condition) %>% summarize(numSub = mean(numSub), numBasic=mean(numBasic)) %>% group_by(condition) %>% summarize(sub = mean(numSub), # suber = sd(numSub)/sqrt(length(numSub)), basic = mean(numBasic)) %>%#, basicer = sd(numBasic)/sqrt(length(numBasic))) %>% ggplot(aes(x = sub, y = basic)) + geom_point(size = 3) + #geom_text(aes(label = condition), nudge_y = -.1, nudge_x = .2, ) + theme_few(20) + ylim(0, 4) + xlim(0, 10) + xlab("mean # subordinate-level labels") + ylab("mean # basic-level labels") + theme(aspect.ratio = 1) ## Joining, by = "gameid"

12

mean # basic−level labels

4

3

2

1

0 0.0

2.5

5.0

7.5

10.0

mean # subordinate−level labels Some questions 1. Is this data ‘valid’ & reliable? Does it allow us to systematically examine the real lexica that are formed? Or is it ‘too hard’ for many turkers, according to some criterion, so that we’re mostly just seeing noise? Collect more data to estimate this better? Redesign task (e.g. only draw targets from half of the hierarchy; make it much longer?) 2. Do they use words appropriately (i.e. not just to literally mean what they say it means but also in context where it’s useful)? 3. What is confusion matrix of objects for diff. words? May see clustering within categories if participants haven’t yet aligned on 4. What is sequence of formation (first occurrence: basic-level, subordinate?) 5. Pragmatic effects (e.g. words on subordinate trials being extended to basic-level) 6. Do people align better when they play longer? 7. What stats to use for coexistence of basic- and sub-?

13