Commit 9da4bea0 authored by D.H.D. Nguyen's avatar D.H.D. Nguyen
Browse files

update README

parent 2c26ba18
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
A
B
C
D
E
F
G
H
I
J
+20 −0
Original line number Diff line number Diff line
D	G	E	A
I	F	J	C
B	H	I	E
F	G	A	B
H	J	D	C
A	C	B	G
D	F	H	E
I	J	H	A
E	C	J	D
I	F	B	G
D	C	G	J
A	I	H	B
E	F	J	A
C	I	D	B
H	G	E	F
E	F	C	A
H	G	J	I
D	B	J	I
G	F	A	C
E	H	B	D
+220 −0
Original line number Diff line number Diff line
#!/usr/bin/perl

################################################################################################################################
#  Authors: Svetlana Kiritchenko, Peter Turney
#  Information and Communications Technologies / Technologies de l'information et des communications
#  National Research Council Canada /Conseil national de recherches Canada
#  
#  Description: generates a set of tuples for Best-Worst Scaling annotation
#
#  Usage: generate-BWS-tuples.pl <file-items>
#    <file-items> is a file that contains a list of items to be annotated (one item per line)
#
#  Output: a list of item tuples (one tuple per line; items in a tuple are separated by tab).
#     The output is written into file <file-items>.tuples
#  
#  Version: 1.2
#  Last modified: Sep. 29, 2016
#
#################################################################################################################################

use warnings;
use strict;
use utf8;
binmode (STDIN,  ":utf8");
binmode (STDOUT, ":utf8");

use List::Util qw( shuffle );

#################################################################################################################################
# PARAMETERS
#################################################################################################################################

# number of items per tuple (typically, 4 or 5)
my $items_per_tuple = 4; 

# Best-Worst Scaling factor (typically 1.5 or 2):
#   multiply the number of items in $file_items by this factor
#   in order to determine the number of tuples to generate
my $factor = 2;  

# number of iterations (typically 100 or 1000)
my $num_iter = 100;


#################################################################################################################################

die "Usage: generate-BWS-tuples.pl <file-items>\n" if(@ARGV < 1);

# file with the list of items (terms)
my $file_items = $ARGV[0];

# output file
my $file_output = $file_items.".tuples";

# random number seed (to make results reproducible)
my $rand_seed = 1234;
srand($rand_seed);


# read the input file with the list of items (terms)
# NOTE(review): the item file is read/written without an encoding layer even
# though STDIN/STDOUT are set to UTF-8 — non-ASCII items round-trip as raw
# bytes; confirm this is intended or add ':encoding(UTF-8)' to both opens.
print STDERR "Reading $file_items ... \n";

my %unique_items = ();
# three-arg open with a lexical filehandle (the former two-arg bareword open
# misbehaves if the filename starts with '<', '>' or '|')
open(my $inf, '<', $file_items) or die("Cannot open the input file $file_items\n");
while (my $line = <$inf>) {
	$line =~ s/[\r\n]+$//;    # remove end-of-line characters

	# check for duplicate items
	if(defined $unique_items{$line}) {
		print STDERR "WARNING: duplicate item ($line); will be included only once.\n";
		next;
	}
	
	$unique_items{$line} = 1;
}
close($inf);

# sort so that the item order (and hence the seeded shuffles) is reproducible
my @items = sort keys %unique_items;
my $num_items = scalar(@items);
my $num_unique_pairs = ($num_items * ($num_items - 1)) / 2;
print STDERR "Read $num_items unique items.\n\n";

# check if the number of unique items is not less than the number of items requested per tuple
if($num_items < $items_per_tuple) {
	print STDERR "ERROR: The number of unique items is less than the number of items requested per tuple\n";
	exit(1);   # non-zero status: a bare exit() would report success on this error path
}


# generate tuples
my $num_tuples = int(0.5 + ($factor * $num_items));   # round(factor * N)
print STDERR "Generating ".$num_tuples." ".$items_per_tuple."-tuples ...\n";

# try $num_iter different randomizations and keep the most two-way-balanced one
print STDERR "Running $num_iter iterations ...\n";

my $best_score;
my @best_tuples = ();

for (my $iter = 1; $iter <= $num_iter; $iter++) {
	print STDERR "iteration $iter\n";

	# generate $num_tuples tuples by randomly sampling without replacement
	my @tuples = ();
	my @ranlist = shuffle(@items);     # make a random list of items
	my %freq_pair = ();                # how often each unordered pair co-occurs in a tuple
	
	my $j = 0;   # index of the current item in the random list
	for (my $i = 0; $i < $num_tuples; $i++) {
	
		my @tuple = ();   # new tuple
		
		# check if we have enough remained items in the random list to form a new tuple
		if(($j + $items_per_tuple) <= @ranlist) {
		
			# form a new tuple with $items_per_tuple items in the random list starting at index $j
			push(@tuple, @ranlist[$j..$j+$items_per_tuple-1]);
			$j += $items_per_tuple;
			
		} else {   
			# get the rest of the list
			my %items = ();
			my $need_more = $items_per_tuple - scalar(@ranlist) + $j;  # the number of items that we will need to get from a new random list
			for(; $j < @ranlist; $j++) {
				push(@tuple, $ranlist[$j]);
				$items{$ranlist[$j]} = 1;
			}
			
			# generate a new random list of items
			@ranlist = shuffle(@items);
			for($j = 0; $j < $need_more; $j++) {
			
				# if a duplicate item, move it to the end of the list
				# (terminates because $num_items >= $items_per_tuple guarantees
				# enough non-duplicates in the fresh list)
				while(defined $items{$ranlist[$j]}) {
					my $h = splice(@ranlist, $j, 1);
					push(@ranlist, $h);
				}
				push(@tuple, $ranlist[$j]);
			}			
		}
		
		my $tuple_string = join("\t", @tuple);
		push(@tuples, $tuple_string);

		# add frequencies of pairs of items
		# (key is the lexicographically smaller item first, so each unordered
		# pair maps to a single canonical key)
		for(my $k1 = 0; $k1 < @tuple; $k1++) {
			for(my $k2 = $k1+1; $k2 < @tuple; $k2++) {
				if($tuple[$k1] lt $tuple[$k2]) {
					$freq_pair{$tuple[$k1]."::".$tuple[$k2]}++;
				} else {
					$freq_pair{$tuple[$k2]."::".$tuple[$k1]}++;
				}
			}
		}
	}	

	# calculate the two-way balance of the set of tuples:
	# the std. deviation of pair co-occurrence counts over ALL unique pairs
	# (pairs that never co-occur count as zeros inside stdev)
	my @freq_pair_values = values %freq_pair;
	my $stddev_pairs = stdev(\@freq_pair_values, $num_unique_pairs);

	# calculate the score for the set and keep the best score and the best set
	# (lower std. deviation = more balanced = better)
	my $score = $stddev_pairs;
  
	if (($iter == 1) || ($score < $best_score)) {
		$best_score = $score;
		@best_tuples = @tuples;
	}  
}

# output the best set of tuples to $file_output
print STDERR "\nWriting the best set of tuples to $file_output ...\n";

open(my $outf, '>', $file_output) or die("Cannot open the output file $file_output\n");
foreach my $t (@best_tuples) {
  print $outf "$t\n";
}
# check close on the write handle: buffered write errors surface here
close($outf) or die("Cannot close the output file $file_output\n");

print STDERR "Finished.\n";



# calculate the standard deviation of a set of values
# Calculate the sample standard deviation over a population of $n_total
# values, where @$x holds only the observed (non-zero) values and the
# remaining ($n_total - @$x) values are implicitly zero.
#
# Arguments: $x       - array ref of observed values
#            $n_total - total population size (>= scalar @$x)
# Returns:   the standard deviation; 0 when $n_total <= 1 (the original
#            guard only covered $n_total == 1 and died with "Illegal
#            division by zero" for $n_total == 0).
sub stdev {
	my ($x, $n_total) = @_;

	# deviation is undefined for fewer than two values
	return 0 if $n_total <= 1;

	my $n = scalar(@{$x});
	my $m = mean($x, $n_total);

	my $sum = 0;
	foreach my $v (@{$x}) {
		$sum += ($m - $v) ** 2;
	}

	# account for the ($n_total - $n) implicit zero values
	$sum += $m * $m * ($n_total - $n);

	return sqrt($sum / ($n_total - 1));
}

# calculate the mean of a set of values
# Calculate the mean over a population of $n_total values, where @$x holds
# only the observed (non-zero) values and the rest are implicitly zero.
#
# Arguments: $x       - array ref of observed values
#            $n_total - total population size (>= scalar @$x)
# Returns:   the mean; 0 when $n_total == 0 (guards against division by
#            zero, which was previously fatal).
sub mean {
	my ($x, $n_total) = @_;

	return 0 if $n_total == 0;

	# $sum was left uninitialized in the original, producing an
	# uninitialized-value warning for an empty @$x under 'use warnings'
	my $sum = 0;
	foreach my $v (@{$x}) {
		$sum += $v;
	}

	return $sum / $n_total;
}
+181 −0
Original line number Diff line number Diff line
Scripts for Best-Worst Scaling
Version 1.2
25 April 2017
Copyright (C) 2016 National Research Council Canada (NRC)
Contact: Saif Mohammad (saif.mohammad@nrc-cnrc.gc.ca)


*********************************************************************************
Terms of use
*********************************************************************************

1. These scripts can be used freely for research purposes. 
2. If you use the scripts, then please cite the associated papers:

Svetlana Kiritchenko and Saif M. Mohammad (2016) Capturing Reliable Fine-Grained Sentiment Associations by Crowdsourcing and Best-Worst Scaling. Proceedings of the 15th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL), San Diego, California, 2016.

Svetlana Kiritchenko and Saif M. Mohammad (2017) Best-Worst Scaling More Reliable than Rating Scales: A Case Study on Sentiment Intensity Annotation. Proceedings of the Annual Meeting of the Association for Computational Linguistics, Vancouver, Canada, 2017.

3. If interested in commercial use of the scripts, send email to the contact. 
4. If you use the scripts in a product or application, then please credit the authors and NRC appropriately. Also, if you send us an email, we will be thrilled to know about how you have used the scripts.
5. National Research Council Canada (NRC) disclaims any responsibility for the use of the scripts and does not provide technical support. However, the contact listed above will be happy to respond to queries and clarifications.
6. Rather than redistributing the scripts, please direct interested parties to this page:
   http://www.saifmohammad.com/WebPages/BestWorst.html


Please feel free to send us an email:
- with feedback regarding the scripts; 
- with information on how you have used the scripts;
- if interested in having us analyze your data for sentiment, emotion, and other affectual information;
- if interested in a collaborative research project.



*********************************************************************************
General Description
*********************************************************************************

Best–Worst Scaling (BWS), also sometimes referred to as Maximum Difference Scaling (MaxDiff), is an annotation scheme that exploits the comparative approach to annotation (Louviere and Woodworth, 1990; Cohen, 2003; Louviere et al., 2015). Annotators are given k items (k-tuple) and asked which item is the Best (highest in terms of the property of interest) and which is the Worst (lowest in terms of the property of interest). K typically ranges from 4 to 5. These annotations can then be easily converted into real-valued scores of association between the items and the property, which eventually allows for creating a ranked list of items as per their association with the property of interest.

- We have used Best-Worst Scaling to manually annotate words and phrases for sentiment through crowdsourcing. We have shown that ranking of terms by sentiment remains remarkably consistent even when the annotation process is repeated with a different set of annotators. The details of this project can be found in the following paper:

Svetlana Kiritchenko and Saif M. Mohammad (2016) Capturing Reliable Fine-Grained Sentiment Associations by Crowdsourcing and Best-Worst Scaling. Proceedings of the 15th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL), San Diego, California, 2016.

- We have also compared the reliability of annotations produced with BWS and the reliability of annotations obtained with conventional rating scales. We showed that BWS annotations are significantly more reliable, especially when only a limited number of annotations (up to 5N, where N is the number of items to be rated) can be obtained. The details of this study can be found in the following paper:

Svetlana Kiritchenko and Saif M. Mohammad (2017) Best-Worst Scaling More Reliable than Rating Scales: A Case Study on Sentiment Intensity Annotation. Proceedings of the Annual Meeting of the Association for Computational Linguistics, Vancouver, Canada, 2017.

- The sentiment lexicons annotated with Best-Worst Scaling and the papers describing their creation and use can be found at http://www.saifmohammad.com/WebPages/BestWorst.html.



*********************************************************************************
The Current Package
*********************************************************************************

The current package includes three scripts and four example files.

Main scripts:
  - "generate-BWS-tuples.pl" is a Perl script that generates tuples for Best-Worst Scaling annotation from a given list of items/terms;
  - "get-scores-from-BWS-annotations-counting.pl" is a Perl script that converts Best-Worst annotations into real-valued scores of association of items with the property of interest.

Additional scripts:
  - "SHR-BWS.pl" is a Perl script that calculates split-half reliability (SHR) of BWS annotations.


Example files:
  - "example-items.txt" is an example file with a list of items; it can be used as an example input file for "generate-BWS-tuples.pl";
  - "example-items.txt.tuples" is an example output file for "generate-BWS-tuples.pl";
  - "example-tuples-annotations.csv" is an example annotation file; it can be used as an example input file for "get-scores-from-BWS-annotations-counting.pl";
  - "example-scores.txt" is an example output file for "get-scores-from-BWS-annotations-counting.pl".



*********************************************************************************
Script for generating item tuples (generate-BWS-tuples.pl)
*********************************************************************************

This script generates tuples for Best-Worst Scaling annotation from a given list of items/terms. Each tuple has exactly k items, where k is usually 4 or 5 and can be set up by the user. Another parameter that can be set up by the user is the number of tuples to generate. In practice, around 1.5 x N to 2 x N tuples (where N is the total number of items) are sufficient to obtain reliable scores.

The tuples are generated by random sampling and satisfy the following criteria:  
1. no two items within a tuple are identical; 
2. each item in the item list appears approximately in the same number of tuples; 
3. each pair of items appears approximately in the same number of tuples. 

The script generates many sets of tuples and outputs the one that best satisfies the above criteria. The number of iterations can be set up by the user.

All parameters can be set directly in the script. Below is the full list of changeable parameters.


***** PARAMETERS: *****

$items_per_tuple: number of items per tuple (typically, 4 or 5)

$factor: Best-Worst Scaling factor (typically 1.5 or 2); multiply the total number of items by this factor in order to determine the number of tuples to generate

$num_iter: number of iterations (typically 100 or 1000)


***** USAGE: *****

generate-BWS-tuples.pl <file-items>

where <file-items> is a file that contains a list of items to be annotated (one item per line)

Output: a list of item tuples (one tuple per line; items in a tuple are separated by tab). The output is written into file <file-items>.tuples.

Example usage: generate-BWS-tuples.pl example-items.txt
The output file should look similar to example-items.txt.tuples.




*********************************************************************************
Script for calculating scores (get-scores-from-BWS-annotations-counting.pl)
*********************************************************************************

This script converts Best-Worst annotations into real-valued scores of association of items with the property of interest. There are several ways of doing the conversion. This script implements the simplest and fastest procedure called Counts Analysis (Orme, 2009): For each item, its score is calculated as the percentage of times the item was chosen as the Best minus the percentage of times the item was chosen as the Worst. The scores range from -1 (least association with the property of interest) to 1 (most association with the property of interest).


***** USAGE: *****

get-scores-from-BWS-annotations-counting.pl <file-annotations>

where <file-annotations> is a CSV (comma-delimited) file with item tuples and Best-Worst annotations. Each line should contain all k items from the tuple, and the items annotated as the Best and as the Worst. For example,

item1,item2,item3,item4,best,worst

File "example-tuples-annotations.csv" shows an example. The annotation file can contain other columns, but they will be ignored. The script assumes that the file has the column names as the first line, and there are columns named "Item1", "Item2", "Item3", "Item4" (for tuple items), and "BestItem", "WorstItem" (for annotations). The user can provide their own column names by modifying the @column_names parameter in the script. Also, there can be more or less than 4 item columns, but the column names for all items should precede the column names for the Best and Worst annotations in @column_names (e.g., "Item1", "Item2", "Item3", "Item4", "Item5", "BestItem", "WorstItem").

Output: items with scores (one item per line). The output is written into STDOUT. 

Example usage: get-scores-from-BWS-annotations-counting.pl example-tuples-annotations.csv
The output should look similar to example-scores.txt.




*********************************************************************************
Script for calculating split-half reliability (SHR-BWS.pl)
*********************************************************************************

This script calculates split-half reliability (SHR) of BWS annotations over a number of trials. SHR is a commonly used approach to determine consistency in psychological studies, which we employ as follows. All annotations for a tuple are randomly split into two halves. Two sets of scores are produced independently from the two halves. Then the correlation between the two sets of scores is calculated. If a method is more reliable, then the correlation of the scores produced by the two halves will be high.


***** PARAMETERS: *****

$num_trials: number of test trials (typically 100)


***** USAGE: *****

SHR-BWS.pl <file-annotations>

where <file-annotations> is a CSV (comma-delimited) file with item tuples and Best-Worst annotations. Each line should contain all k items from the tuple, and the items annotated as the Best and as the Worst. For example,

item1,item2,item3,item4,best,worst

File "example-tuples-annotations.csv" shows an example. The annotation file can contain other columns, but they will be ignored. The script assumes that the file has the column names as the first line, and there are columns named "Item1", "Item2", "Item3", "Item4" (for tuple items), and "BestItem", "WorstItem" (for annotations). The user can provide their own column names by modifying the @column_names parameter in the script. Also, there can be more or less than 4 item columns, but the column names for all items should precede the column names for the Best and Worst annotations in @column_names (e.g., "Item1", "Item2", "Item3", "Item4", "Item5", "BestItem", "WorstItem").

Output: average Spearman rank correlation and Pearson correlation for the scores obtained from two annotation half-sets. The output is written into STDOUT. 



*********************************************************************************
More Information
*********************************************************************************

Svetlana Kiritchenko and Saif M. Mohammad (2016) Capturing Reliable Fine-Grained Sentiment Associations by Crowdsourcing and Best-Worst Scaling. Proceedings of the 15th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL), San Diego, California, 2016.

Svetlana Kiritchenko and Saif M. Mohammad (2017) Best-Worst Scaling More Reliable than Rating Scales: A Case Study on Sentiment Intensity Annotation. Proceedings of the Annual Meeting of the Association for Computational Linguistics, Vancouver, Canada, 2017.


Steven H. Cohen. (2003) Maximum difference scaling: Improved measures of importance and preference for segmentation. Sawtooth Software, Inc.

Jordan J. Louviere and George G. Woodworth (1990) Best-worst analysis. Working Paper. Department of Marketing and Economic Analysis, University of Alberta.

Jordan J. Louviere, Terry N. Flynn, and A. A. J. Marley (2015) Best-Worst Scaling: Theory, Methods and Applications. Cambridge University Press.

Bryan Orme (2009) Maxdiff analysis: Simple counting, individual-level logit, and HB. Sawtooth Software, Inc.
+6 −1
Original line number Diff line number Diff line
@@ -4,10 +4,15 @@
## Was gibt es?
1. Objekt: Items in Datenbank 
2. **Aufgaben**:
	- Daten aus *Datenbank* extrahieren, um Tupeln zu erzeugen
	- Daten aus *Datenbank* extrahieren und gespeicherte Optionen verwenden, um Tupeln zu erzeugen  
		- **Anmerkung**: am bestens Code in **BWS-generate-tuples** reimplementieren (Skript von Autoren für BWS), da sie nicht nur einfach zufällig alle *factor*x*N* Tupeln einmal erzeugt (nur eine Liste), sondern mehrmals (*n* Epoche) nach Kriterien (*n* Listen), und versucht die Liste, die zu Kriterien am bestens passt, davon als Eingabe für Umfrage zu nehmen. Read more: **BWS-generate-tuples/readme.txt** - Zeilen 74-113
	- gemeinsame Umfrage erstellen
	- Umfrage in *Seite für Annotatoren* einfügen
3. Testdatei für Skript zur Tupeln-Erzeugung:
	- **BWS-generate-tuples/example-items.txt**

## Erwartete(s) Output(s)
1. Umfrage aus Tupeln

## Tools
* python3
Loading