From 2ce2c056c8e17aa479e78cd6b6285546e2f28268 Mon Sep 17 00:00:00 2001
From: chrysanthopoulou <vm255@stud.uni-heidelberg.de>
Date: Thu, 29 Feb 2024 17:44:53 +0100
Subject: [PATCH] Update some installation matters and metrics readme

---
 metrics/README.md                           |  48 ++++++++--
 metrics/general_execution_script.sh         |   8 +-
 metrics/grade/install_grade.sh              |   1 +
 metrics/gruen/README.md                     | 101 ++++++++++++++++++++
 metrics/gruen/install.sh                    |   4 +-
 metrics/requirements/grade_requirements.txt |   4 +-
 metrics/virtual_environment_creator.sh      |   6 +-
 metrics/virtual_environment_populator.sh    |  47 +++++++++
 metrics/virtual_environment_populator.txt   |   3 +
 9 files changed, 207 insertions(+), 15 deletions(-)
 create mode 100644 metrics/gruen/README.md
 create mode 100644 metrics/virtual_environment_populator.sh
 create mode 100644 metrics/virtual_environment_populator.txt

diff --git a/metrics/README.md b/metrics/README.md
index 537ec25..541261f 100644
--- a/metrics/README.md
+++ b/metrics/README.md
@@ -1,10 +1,44 @@
-# frankenstein metrics
+# Metrics
 
-virtual environments required:
-    - ./preprocessing_data/preprocessing_venv
-    - ./grade/grade_venv_3
-    - ./deam/deam_venv_2
-    - ./gruen/gruen_venv_1
-    - ./S3BERT/s3bert_venv_2
+<!-- virtual environments required: 
+    ./preprocessing_data/preprocessing_venv
+    ./grade/grade_venv_3
+    ./deam/deam_venv_2
+    ./gruen/gruen_venv_1
+    ./S3BERT/s3bert_venv_2
+-->
 
+This is the Metrics section of our project. 
+In order to run the metrics we want on our dialogues, we need to set up separate virtual environments for each of them. <br>
+To set up the environments on the CoLi Cluster (to be adapted to your workspace), you first need to run: <br> <br>
+**sh virtual_environment_creator.sh** <br> <br> 
+in your terminal.
+Then, after that process has run through, run <br><br>
+**sbatch virtual_environment_populator.sh** <br><br>
+in your terminal.
 
+Then download the models from the different repositories of the metrics by following the instructions in the respective readmes. Then run the <br> <br>
+**general_execution_script.sh**
+<br><br>
+
+In the **general_execution_script.sh** you can specify which dialogue dataset you want analysed by setting DATAFRAME_PATH="example_file.csv" to the file you want. 
+
+<!--S3BERT setup
+
+cd S3BERT
+
+download models
+wget https://www.cl.uni-heidelberg.de/~opitz/data/s3bert_all-mpnet-base-v2.tar.gz
+wget https://www.cl.uni-heidelberg.de/~opitz/data/s3bert_all-MiniLM-L12-v2.tar.gz
+
+unpack in src 
+tar -xvzf s3bert_all-mpnet-base-v2.tar.gz -C src/
+tar -xvzf s3bert_all-MiniLM-L12-v2.tar.gz -C src/
+
+remove zip files
+rm -rf s3bert_all-mpnet-base-v2.tar.gz
+rm -rf s3bert_all-MiniLM-L12-v2.tar.gz
+
+GRUEN setup
+
+cd gruen -->
diff --git a/metrics/general_execution_script.sh b/metrics/general_execution_script.sh
index baf6be8..794043c 100644
--- a/metrics/general_execution_script.sh
+++ b/metrics/general_execution_script.sh
@@ -28,7 +28,7 @@ cd ..
 
 
 # grade
-source ./grade/grade_venv_3/bin/activate
+source ./grade/grade_venv/bin/activate
 print_status
 cd grade/GRADE/script/
 sbatch --dependency=afterok:$JOB1 execute_w_gpu.sh
@@ -37,7 +37,7 @@ cd ../../..
 print_status
 
 # deam
-source ./deam/deam_venv_2/bin/activate
+source ./deam/deam_venv/bin/activate
 print_status
 cd deam/
 sbatch --dependency=afterok:$JOB1 execute_w_gpu.sh
@@ -46,7 +46,7 @@ cd ..
 print_status
 
 # gruen
-source ./gruen/gruen_venv_1/bin/activate
+source ./gruen/gruen_venv/bin/activate
 print_status
 cd gruen
 sbatch run_gruen.sh
@@ -55,7 +55,7 @@ cd ..
 
 
 # S3BERT
-source ./S3BERT/s3bert_venv_2/bin/activate
+source ./S3BERT/s3bert_venv/bin/activate
 print_status
 cd S3BERT/src
 sbatch execute_w_gpu.sh
diff --git a/metrics/grade/install_grade.sh b/metrics/grade/install_grade.sh
index 0c5368a..ebb24ba 100644
--- a/metrics/grade/install_grade.sh
+++ b/metrics/grade/install_grade.sh
@@ -10,3 +10,4 @@ rm $DIALOG_NAME
 #1CaRhHnO0YsQHOnJsmMUJuL4w9HXJZQYw tools id
 #1v9o-fSohFDegicakrSEnKNcKliOqhYfH grade checkpoint
 
+#wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
diff --git a/metrics/gruen/README.md b/metrics/gruen/README.md
new file mode 100644
index 0000000..bae8582
--- /dev/null
+++ b/metrics/gruen/README.md
@@ -0,0 +1,101 @@
+![Python 3](https://img.shields.io/badge/python-3-green.svg)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+# GRUEN for Evaluating Linguistic Quality of Generated Text
+
+This repo is the GRUEN metric implementation of [GRUEN for Evaluating Linguistic Quality of Generated Text](https://arxiv.org/pdf/2010.02498.pdf) (Findings of EMNLP 2020). 
+
+
+## Table of Contents
+- [Introduction](#Introduction)
+- [Code](#Code)
+- [Dataset](#Dataset)
+- [Related Papers](#Related-Papers)
+- [Citation](#Citation)
+
+
+## Introduction
+GRUEN aims at evaluating the __linguistic quality__ of generated text from machine learning models.
+Specifically, it aims to capture the four linguistic dimensions in Table 1. 
+
+<p align="center"><img width="50%" src="linguistic_aspects.png"/></p>
+
+
+GRUEN has been shown to correlate well with human judgments on 13 datasets over the five natural language generation tasks below: 
+- Abstractive Text Summarization
+- Machine Translation
+- Dialogue System
+- Text Simplification
+- Text Compression
+
+
+
+## Code:
+The code is based on Python 3. 
+1) Install the dependencies as below: 
+    ```
+    pip install -r requirements.txt
+    ```
+    or using conda environment file:
+    ```
+    conda env create --file environment.yml
+    ```
+
+2) Use the shell script to download the CoLA models.  
+```chmod u+x install.sh && ./install.sh```
+
+3) Run main.py for an example usage.   
+  ```python -m main```  
+
+
+
+## Dataset
+It is critically important to collect human judgments (_i.e._, the manual linguistic quality annotation score) of the system output.  
+
+To ease future research on proposing novel evaluation metrics, we summarize some benchmark datasets below. 
+For license issues, we are unable to provide links for downloading the data and the human judgments. 
+We, however, point out how you can access them. 
+ 
+
+__Abstractive Text Summarization__:
+- _CNN/Daily Mail_: The dataset is originally proposed by [Hermann et al. (2015)](https://papers.nips.cc/paper/5945-teaching-machines-to-read-and-comprehend.pdf) and [Nallapati et al. (2016)](https://www.aclweb.org/anthology/K16-1028.pdf). The human judgments are collected by [Chaganty et al. (2018)](https://www.aclweb.org/anthology/P18-1060.pdf). 
+- _TAC-2011_: Please refer to [the link here](https://tac.nist.gov/data/past/2011/Summ11.html).
+- _DUC2005, DUC2006, DUC2007_: Please refer to [the link here](https://www-nlpir.nist.gov/projects/duc/data.html). 
+
+__Machine Translation__: 
+- _WMT16_: Please refer to [the link here](http://www.statmt.org/wmt16/). It has six human annotated datasets (_i.e._, cs-en, de-en, fi-en, ro-en, ru-en, tr-en). 
+
+__Dialogue System__:
+- _BAGEL_: The dataset is originally proposed by [Mairesse et al. (2010)](https://www.aclweb.org/anthology/P10-1157.pdf). The human judgments are collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+- _SFHOTEL_: The dataset is originally proposed by [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf). The human judgments are collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+- _SFREST_: The dataset is originally proposed by [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf). The human judgments are collected by [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf).
+
+__Text Simplification__:
+- _[Xu et al. (2016)](https://www.aclweb.org/anthology/Q16-1029.pdf)_: The dataset is available [here](https://github.com/cocoxu/simplification/). Please email the first author to ask for the human judgments.
+
+__Text Compression__:
+- _[Toutanova et al. (2016)](https://www.aclweb.org/anthology/D16-1033.pdf)_: Please refer to the [paper](https://www.aclweb.org/anthology/D16-1033.pdf). 
+
+
+## Related Papers
+- [Dang (2006)](https://duc.nist.gov/pubs/2006papers/duc2006.pdf): Overview of DUC 2006 (Document Understanding Conference 2006)
+- [Hermann et al. (2015)](https://papers.nips.cc/paper/5945-teaching-machines-to-read-and-comprehend.pdf): Teaching machines to read and comprehend (NIPS 2015)
+- [Nallapati et al. (2016)](https://www.aclweb.org/anthology/K16-1028.pdf): Abstractive text summarization using sequence-to-sequence RNNs and beyond (CoNLL 2016)
+- [Chaganty et al. (2018)](https://www.aclweb.org/anthology/P18-1060.pdf): The price of debiasing automatic metrics in natural language evaluation (ACL 2018)
+- [Mairesse et al. (2010)](https://www.aclweb.org/anthology/P10-1157.pdf): Phrase-based statistical language generation using graphical models and active learning (ACL 2010)
+- [Wen et al. (2015)](https://www.aclweb.org/anthology/D15-1199.pdf): Semantically conditioned LSTM-based natural language generation for spoken dialogue systems (EMNLP 2015)
+- [Novikova et al. (2017)](https://www.aclweb.org/anthology/D17-1238.pdf): Why we need new evaluation metrics for NLG (EMNLP 2017)
+- [Xu et al. (2016)](https://www.aclweb.org/anthology/Q16-1029.pdf): Optimizing statistical machine translation for text simplification (TACL 2016)
+- [Toutanova et al. (2016)](https://www.aclweb.org/anthology/D16-1033.pdf): A dataset and evaluation metrics for abstractive compression of sentences and short paragraphs (EMNLP 2016) 
+
+
+## Citation
+If you find this repo useful, please cite:
+```bibtex
+@inproceedings{zhu2020gruen,
+  title={GRUEN for Evaluating Linguistic Quality of Generated Text},
+  author={Zhu, Wanzheng and Bhat, Suma},
+  booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings},
+  pages={94--108},
+  year={2020}
+}
+```
\ No newline at end of file
diff --git a/metrics/gruen/install.sh b/metrics/gruen/install.sh
index 559335b..ff3f957 100755
--- a/metrics/gruen/install.sh
+++ b/metrics/gruen/install.sh
@@ -3,4 +3,6 @@ unzip cola_model.zip
 python -m spacy download en_core_web_md
 
 # instead of their gdown --> wget
-wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
\ No newline at end of file
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6" -O cola_model.zip && rm -rf /tmp/cookies.txt
+# NOTE: kept for reference only — this plain wget bypasses the confirm-token dance
+#wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Hw5na_Iy4-kGEoX60bD8vXYeJDQrzyj6' -O cola_model.zip
diff --git a/metrics/requirements/grade_requirements.txt b/metrics/requirements/grade_requirements.txt
index ea7b91f..0237a9e 100644
--- a/metrics/requirements/grade_requirements.txt
+++ b/metrics/requirements/grade_requirements.txt
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e45c3909759f93698b22fd01275bb8f065b5b1b65823ce0dbb112f0ea1c7e82
-size 1872
+oid sha256:cda7729b9d9bd02117b5c85b9b3375ce97a4f224f00f0f430ad8819b00f9ca1c
+size 1787
diff --git a/metrics/virtual_environment_creator.sh b/metrics/virtual_environment_creator.sh
index 9ce58df..8df186c 100644
--- a/metrics/virtual_environment_creator.sh
+++ b/metrics/virtual_environment_creator.sh
@@ -1 +1,5 @@
-python3 -m venv trial_venv_7 || { echo "Failed to create virtual environment"; exit 1; }
\ No newline at end of file
+python3 -m venv ./preprocessing_data/preprocessing_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./grade/grade_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./deam/deam_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./gruen/gruen_venv || { echo "Failed to create virtual environment"; exit 1; }
+python3 -m venv ./S3BERT/s3bert_venv || { echo "Failed to create virtual environment"; exit 1; }
diff --git a/metrics/virtual_environment_populator.sh b/metrics/virtual_environment_populator.sh
new file mode 100644
index 0000000..78c00a5
--- /dev/null
+++ b/metrics/virtual_environment_populator.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --job-name=virtual_environment_populator
+#SBATCH --output=virtual_environment_populator.txt
+#SBATCH --mail-user=chrysanthopoulou@cl.uni-heidelberg.de
+#SBATCH --mail-type=ALL
+
+# Virtual environments populated below (created beforehand by virtual_environment_creator.sh):
+# ./preprocessing_data/preprocessing_venv
+# ./grade/grade_venv
+# ./deam/deam_venv
+# ./gruen/gruen_venv
+# ./S3BERT/s3bert_venv
+
+# preprocessing_venv 
+echo preprocessing
+source ./preprocessing_data/preprocessing_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/preprocessing_requirements.txt || { echo "Failed to install requirements"; exit 1; } 
+echo "success"
+deactivate
+
+#grade_venv
+echo grade
+source ./grade/grade_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/grade_requirements.txt || { echo "Failed to install requirements"; exit 1; } 
+echo "success"
+deactivate
+
+#deam_venv
+echo deam
+source ./deam/deam_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/deam_requirements.txt || { echo "Failed to install requirements"; exit 1; } 
+echo "success"
+deactivate
+
+#gruen_venv
+echo gruen
+source ./gruen/gruen_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/gruen_requirements.txt || { echo "Failed to install requirements"; exit 1; } 
+echo "success"
+deactivate
+
+#s3bert_venv
+echo s3bert
+source ./S3BERT/s3bert_venv/bin/activate || { echo "Failed to activate virtual environment"; exit 1; }
+pip install -r ./requirements/s3bert_requirements.txt || { echo "Failed to install requirements"; exit 1; } 
+echo "success"
+deactivate
\ No newline at end of file
diff --git a/metrics/virtual_environment_populator.txt b/metrics/virtual_environment_populator.txt
new file mode 100644
index 0000000..68e85f1
--- /dev/null
+++ b/metrics/virtual_environment_populator.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e8d37f18e29560ae31b0be6243178f270fd8b61935e9e7a3aa6251872f500e
+size 47211
-- 
GitLab