From 2ce2c056c8e17aa479e78cd6b6285546e2f28268 Mon Sep 17 00:00:00 2001
From: chrysanthopoulou <vm255@stud.uni-heidelberg.de>
Date: Thu, 29 Feb 2024 17:44:53 +0100
Subject: [PATCH] Update some installation matters and metrics readme

---
 metrics/README.md                           |  48 ++++++++--
 metrics/general_execution_script.sh         |   8 +-
 metrics/grade/install_grade.sh              |   1 +
 metrics/gruen/README.md                     | 101 ++++++++++++++++++++
 metrics/gruen/install.sh                    |   4 +-
 metrics/requirements/grade_requirements.txt |   4 +-
 metrics/virtual_environment_creator.sh      |   6 +-
 metrics/virtual_environment_populator.sh    |  47 +++++++++
 metrics/virtual_environment_populator.txt   |   3 +
 9 files changed, 207 insertions(+), 15 deletions(-)
 create mode 100644 metrics/gruen/README.md
 create mode 100644 metrics/virtual_environment_populator.sh
 create mode 100644 metrics/virtual_environment_populator.txt

diff --git a/metrics/README.md b/metrics/README.md
index 537ec25..541261f 100644
--- a/metrics/README.md
+++ b/metrics/README.md
@@ -1,10 +1,44 @@
-# frankenstein metrics
+# Metrics
 
-virtual environments required:
- - ./preprocessing_data/preprocessing_venv
- - ./grade/grade_venv_3
- - ./deam/deam_venv_2
- - ./gruen/gruen_venv_1
- - ./S3BERT/s3bert_venv_2
+<!-- virtual environments required:
+ ./preprocessing_data/preprocessing_venv
+ ./grade/grade_venv
+ ./deam/deam_venv
+ ./gruen/gruen_venv
+ ./S3BERT/s3bert_venv
+-->
+This is the Metrics section of our project.
+To run our chosen metrics on the dialogues, we need to set up a separate virtual environment for each of them. <br>
+To set up the environments on the CoLi Cluster (adapt the paths to your workspace), first run <br><br>
+**sh virtual_environment_creator.sh** <br><br>
+in your terminal.
+Once that process has finished, run <br><br>
+**sbatch virtual_environment_populator.sh** <br><br>
+in your terminal.
+Next, download the models from the metrics' respective repositories, following the instructions in their READMEs. Finally, run <br><br>
+**general_execution_script.sh**
+<br><br>
+
+In **general_execution_script.sh** you can choose which dialogue dataset is analysed by pointing DATAFRAME_PATH at the file you want, e.g. DATAFRAME_PATH="example_file.csv".
+
+<!--S3BERT setup
+
+cd S3BERT
+
+download models
+wget https://www.cl.uni-heidelberg.de/~opitz/data/s3bert_all-mpnet-base-v2.tar.gz
+wget https://www.cl.uni-heidelberg.de/~opitz/data/s3bert_all-MiniLM-L12-v2.tar.gz
+
+unpack in src
+tar -xvzf s3bert_all-mpnet-base-v2.tar.gz -C src/
+tar -xvzf s3bert_all-MiniLM-L12-v2.tar.gz -C src/
+
+remove zip files
+rm -rf s3bert_all-mpnet-base-v2.tar.gz
+rm -rf s3bert_all-MiniLM-L12-v2.tar.gz
+
+GRUEN setup
+
+cd gruen -->
diff --git a/metrics/general_execution_script.sh b/metrics/general_execution_script.sh
index baf6be8..794043c 100644
--- a/metrics/general_execution_script.sh
+++ b/metrics/general_execution_script.sh
@@ -28,7 +28,7 @@ cd ..
 
 # grade
-source ./grade/grade_venv_3/bin/activate
+source ./grade/grade_venv/bin/activate
 print_status
 cd grade/GRADE/script/
 sbatch --dependency=afterok:$JOB1 execute_w_gpu.sh
@@ -37,7 +37,7 @@ cd ../../..
 print_status
 
 # deam
-source ./deam/deam_venv_2/bin/activate
+source ./deam/deam_venv/bin/activate
 print_status
 cd deam/
 sbatch --dependency=afterok:$JOB1 execute_w_gpu.sh
@@ -46,7 +46,7 @@ cd ..
 print_status
 
 # gruen
-source ./gruen/gruen_venv_1/bin/activate
+source ./gruen/gruen_venv/bin/activate
 print_status
 cd gruen
 sbatch run_gruen.sh
@@ -55,7 +55,7 @@ cd ..
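+# Note: each metric below follows the same pattern -- activate that metric's
+# venv, then submit its job. Jobs submitted with
+# "sbatch --dependency=afterok:$JOB1 <script>" are assumed to wait for the
+# preprocessing job (whose ID is captured in $JOB1 earlier in this script)
+# and only start once it has exited successfully.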
 
 # S3BERT
-source ./S3BERT/s3bert_venv_2/bin/activate
+source ./S3BERT/s3bert_venv/bin/activate
 print_status
 cd S3BERT/src
 sbatch execute_w_gpu.sh
diff --git a/metrics/grade/install_grade.sh b/metrics/grade/install_grade.sh
index 0c5368a..ebb24ba 100644
--- a/metrics/grade/install_grade.sh
+++ b/metrics/grade/install_grade.sh
@@ -10,3 +10,4 @@ rm $DIALOG_NAME
 #1CaRhHnO0YsQHOnJsmMUJuL4w9HXJZQYw tools id
 #1v9o-fSohFDegicakrSEnKNcKliOqhYfH grade checkpoint
+#wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
diff --git a/metrics/gruen/README.md b/metrics/gruen/README.md
new file mode 100644
index 0000000..bae8582
--- /dev/null
+++ b/metrics/gruen/README.md
@@ -0,0 +1,101 @@
+
+[MIT License](https://opensource.org/licenses/MIT)
+
+# GRUEN for Evaluating Linguistic Quality of Generated Text
+
+This repo contains the implementation of the GRUEN metric from [GRUEN for Evaluating Linguistic Quality of Generated Text](https://arxiv.org/pdf/2010.02498.pdf) (Findings of EMNLP 2020).
+
+
+## Table of Contents
+- [Introduction](#Introduction)
+- [Code](#Code)
+- [Dataset](#Dataset)
+- [Related Papers](#Related-Papers)
+- [Citation](#Citation)
+
+
+## Introduction
+GRUEN aims at evaluating the __linguistic quality__ of generated text from machine learning models.
+Specifically, it aims to capture the four linguistic dimensions in Table 1.
+
+<p align="center"><img width="50%" src="linguistic_aspects.png"/></p>
+
+
+GRUEN has been shown to correlate well with human judgments on 13 datasets over the five natural language generation tasks below:
+- Abstractive Text Summarization
+- Machine Translation
+- Dialogue System
+- Text Simplification
+- Text Compression
+
+
+
+## Code
+The code is based on Python 3.
+1) Install the dependencies:
+   ```
+   pip install -r requirements.txt
+   ```
+   or use the conda environment file:
+   ```
+   conda env create --file environment.yml
+   ```
+
+2) Use the shell script to download the CoLA model:
+   ```chmod u+x install.sh && ./install.sh```
+
+3) Run main.py for an example usage:
+   ```python -m main```
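+
+For example, from the top-level metrics directory of this project, the whole
+sequence might look like this (a sketch under our setup: the gruen_venv and
+its requirements file come from our own setup scripts, not from the upstream
+GRUEN repo):
+
+```
+source ./gruen/gruen_venv/bin/activate   # venv populated with gruen_requirements.txt
+cd gruen
+chmod u+x install.sh && ./install.sh     # fetches the CoLA model and en_core_web_md
+python -m main                           # runs the example usage
+deactivate
+```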
+
+
+## Dataset
+It is critically important to collect human judgments (_i.e._, manual linguistic quality annotation scores) of the system output.
+
+To ease future research on proposing novel evaluation metrics, we summarize some benchmark datasets below.
+For license reasons, we are unable to provide links for downloading the data and the human judgments.
+We do, however, point out how you can access them.
+
+
+__Abstractive Text Summarization__:
+- _CNN/Daily Mail_: The dataset was originally proposed by [Hermann et al. (2015)](https://papers.nips.cc/paper/5945-teaching-machines-to-read-and-comprehend.pdf) and [Nallapati et al. (2016)](https://www.aclweb.org/anthology/K16-1028.pdf). The human judgments were collected by [Chaganty et al. (2018)](https://www.aclweb.org/anthology/P18-1060.pdf).
+- _TAC-2011_: Please refer to [the link here](https://tac.nist.gov/data/past/2011/Summ11.html).
+- _DUC2005, DUC2006, DUC2007_: Please refer to [the link here](https://www-nlpir.nist.gov/projects/duc/data.html).
+
+__Machine Translation__:
+- _WMT16_: Please refer to [the link here](http://www.statmt.org/wmt16/). It has six human-annotated datasets (_i.e._, cs-en, de-en, fi-en, ro-en, ru-en, tr-en).
+
+__Dialogue System__:
+- _BAGEL_: The dataset was originally proposed by [Mairesse et al. (2010)](https://www.aclweb.org/anthology/P10-1157.pdf). The human judgments were collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+- _SFHOTEL_: The dataset was originally proposed by [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf). The human judgments were collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+- _SFREST_: The dataset was originally proposed by [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf). The human judgments were collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+
+__Text Simplification__:
+- _[Xu et al. (2016)](https://www.aclweb.org/anthology/Q16-1029.pdf)_: The dataset is available [here](https://github.com/cocoxu/simplification/). Please email the first author to ask for the human judgments.
+
+__Text Compression__:
+- _[Toutanova et al. (2016)](https://www.aclweb.org/anthology/D16-1033.pdf)_: Please refer to the [paper](https://www.aclweb.org/anthology/D16-1033.pdf).
+
+
+## Related Papers
+- [Dang (2006)](https://duc.nist.gov/pubs/2006papers/duc2006.pdf): Overview of DUC 2006 (Document Understanding Conference 2006)
+- [Hermann et al. (2015)](https://papers.nips.cc/paper/5945-teaching-machines-to-read-and-comprehend.pdf): Teaching machines to read and comprehend (NIPS 2015)
+- [Nallapati et al. (2016)](https://www.aclweb.org/anthology/K16-1028.pdf): Abstractive text summarization using sequence-to-sequence RNNs and beyond (CoNLL 2016)
+- [Chaganty et al. (2018)](https://www.aclweb.org/anthology/P18-1060.pdf): The price of debiasing automatic metrics in natural language evaluation (ACL 2018)
+- [Mairesse et al. (2010)](https://www.aclweb.org/anthology/P10-1157.pdf): Phrase-based statistical language generation using graphical models and active learning (ACL 2010)
+- [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf): Semantically conditioned LSTM-based natural language generation for spoken dialogue systems (EMNLP 2015)
+- [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf): Why we need new evaluation metrics for NLG (EMNLP 2017)
+- [Xu et al. (2016)](https://www.aclweb.org/anthology/Q16-1029.pdf): Optimizing statistical machine translation for text simplification (TACL 2016)
+- [Toutanova et al. (2016)](https://www.aclweb.org/anthology/D16-1033.pdf): A dataset and evaluation metrics for abstractive compression of sentences and short paragraphs (EMNLP 2016)
+
+
+## Citation
+If you find this repo useful, please cite:
+```bibtex
+@inproceedings{zhu2020gruen,
+  title={GRUEN for Evaluating Linguistic Quality of Generated Text},
+  author={Zhu, Wanzheng and Bhat, Suma},
+  booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings},
+  pages={94--108},
+  year={2020}
+}
+```
\ No newline at end of file
diff --git a/metrics/gruen/install.sh b/metrics/gruen/install.sh
index 559335b..ff3f957 100755
--- a/metrics/gruen/install.sh
+++ b/metrics/gruen/install.sh
@@ -3,4 +3,6 @@ unzip cola_model.zip
 python -m spacy download en_core_web_md
 # instead of their gdown --> wget
-wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
\ No newline at end of file
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
+
+wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O cola_model.zip
diff --git a/metrics/requirements/grade_requirements.txt b/metrics/requirements/grade_requirements.txt
index ea7b91f..0237a9e 100644
--- a/metrics/requirements/grade_requirements.txt
+++ b/metrics/requirements/grade_requirements.txt
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e45c3909759f93698b22fd01275bb8f065b5b1b65823ce0dbb112f0ea1c7e82
-size 1872
+oid sha256:cda7729b9d9bd02117b5c85b9b3375ce97a4f224f00f0f430ad8819b00f9ca1c
+size 1787
diff --git a/metrics/virtual_environment_creator.sh b/metrics/virtual_environment_creator.sh
index 9ce58df..8df186c 100644
--- a/metrics/virtual_environment_creator.sh
+++ b/metrics/virtual_environment_creator.sh
@@ -1 +1,5 @@
-python3 -m venv trial_venv_7 || { echo "Failed to create virtual environment"; exit 1; }
\ No newline at end of file
+python3 -m venv ./preprocessing_data/preprocessing_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./grade/grade_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./deam/deam_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./gruen/gruen_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./S3BERT/s3bert_venv || { echo "Failed to create virtual environment"; exit 1; }
diff --git a/metrics/virtual_environment_populator.sh b/metrics/virtual_environment_populator.sh
new file mode 100644
index 0000000..78c00a5
--- /dev/null
+++ b/metrics/virtual_environment_populator.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --job-name=virtual_environment_populator
+#SBATCH --output=virtual_environment_populator.txt
+#SBATCH --mail-user=chrysanthopoulou@cl.uni-heidelberg.de
+#SBATCH --mail-type=ALL
+
+
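+# This script installs each metric's requirements into the venv created for it
+# by virtual_environment_creator.sh. The five stanzas below share one
+# activate / pip install / deactivate pattern; a loop-equivalent sketch
+# (using the same directory:venv pairs as the stanzas) would be:
+#
+#   for pair in preprocessing_data:preprocessing grade:grade deam:deam gruen:gruen S3BERT:s3bert; do
+#     dir=${pair%%:*}; name=${pair##*:}
+#     source "./$dir/${name}_venv/bin/activate" || { echo "Failed to activate virtual environment"; exit 1; }
+#     pip install -r "./requirements/${name}_requirements.txt" || { echo "Failed to install requirements"; exit 1; }
+#     deactivate
+#   done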
+# venvs populated by this script:
+# ./preprocessing_data/preprocessing_venv
+# ./grade/grade_venv
+# ./deam/deam_venv
+# ./gruen/gruen_venv
+# ./S3BERT/s3bert_venv
+
+# preprocessing_venv
+echo preprocessing
+source ./preprocessing_data/preprocessing_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/preprocessing_requirements.txt || { echo "Failed to install requirements"; exit 1; }
+echo "success"
+deactivate
+
+# grade_venv
+echo grade
+source ./grade/grade_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/grade_requirements.txt || { echo "Failed to install requirements"; exit 1; }
+echo "success"
+deactivate
+
+# deam_venv
+echo deam
+source ./deam/deam_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/deam_requirements.txt || { echo "Failed to install requirements"; exit 1; }
+echo "success"
+deactivate
+
+# gruen_venv
+echo gruen
+source ./gruen/gruen_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/gruen_requirements.txt || { echo "Failed to install requirements"; exit 1; }
+echo "success"
+deactivate
+
+# s3bert_venv
+echo s3bert
+source ./S3BERT/s3bert_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/s3bert_requirements.txt || { echo "Failed to install requirements"; exit 1; }
+echo "success"
+deactivate
\ No newline at end of file
diff --git a/metrics/virtual_environment_populator.txt b/metrics/virtual_environment_populator.txt
new file mode 100644
index 0000000..68e85f1
--- /dev/null
+++ b/metrics/virtual_environment_populator.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e8d37f18e29560ae31b0be6243178f270fd8b61935e9e7a3aa6251872f500e
+size 47211
-- 
GitLab