Merge pull request #396 from edx/hassan/acceptance-tests-validation

Fixed acceptance tests validation.

Merge pull request #396 from edx/hassan/acceptance-tests-validation
Fixed acceptance tests validation.
39dbbe00 · Hassan · GitHub · 795694d5 · fda68608 · 39dbbe00
Commit 39dbbe00 authored May 18, 2017 by Hassan Committed by GitHub May 18, 2017
9 changed files
--- a/edx/analytics/tasks/tests/acceptance/__init__.py
+++ b/edx/analytics/tasks/tests/acceptance/__init__.py
@@ -5,6 +5,7 @@ import logging
 import os
 import shutil
 import unittest
+import csv
 from luigi.s3 import S3Client
 import pandas
@@ -112,6 +113,20 @@ def modify_target_for_local_server(target):
        return target
+def coerce_columns_to_string(row):
+    # Vertica returns some columns as typed objects (e.g. datetime, Decimal), so convert
+    # them to strings before comparing with the expected output.
+    return [str(x) for x in row]
+def read_csv_fixture_as_list(fixture_file_path):
+    with open(fixture_file_path) as fixture_file:
+        reader = csv.reader(fixture_file)
+        next(reader)  # skip header
+        fixture_data = list(reader)
+    return fixture_data
 class AcceptanceTestCase(unittest.TestCase):
    acceptance = 1

--- a/edx/analytics/tasks/tests/acceptance/fixtures/output/acceptance_expected_d_user.csv
+++ b/edx/analytics/tasks/tests/acceptance/fixtures/output/acceptance_expected_d_user.csv
@@ -2,4 +2,4 @@ user_id,user_year_of_birth,user_level_of_education,user_gender,user_email,user_u
 1,1984,a,m,honor@example.com,honor,2014-06-27 16:02:38,UNKNOWN
 2,1975,b,m,audit@example.com,audit,2014-06-27 16:02:39,IE
 3,2000,b,,verified@example.com,verified,2014-06-27 16:02:41,UNKNOWN
-4,2000,,,staff@example.com,staff,2014-06-27 16:02:43,TH
+4,2000,,None,staff@example.com,staff,2014-06-27 16:02:43,TH
\ No newline at end of file
--- a/edx/analytics/tasks/tests/acceptance/fixtures/output/expected_financial_report.csv
+++ b/edx/analytics/tasks/tests/acceptance/fixtures/output/expected_financial_report.csv
--- a/edx/analytics/tasks/tests/acceptance/test_course_catalog.py
+++ b/edx/analytics/tasks/tests/acceptance/test_course_catalog.py
@@ -4,6 +4,7 @@ End to end test of the course catalog tasks.
 import logging
 import os
+import datetime
 import pandas
@@ -50,16 +51,24 @@ class CourseSubjectsAcceptanceTest(BaseCourseCatalogAcceptanceTest):
    def validate_output(self):
        """Validates the output, comparing it to a csv of all the expected output from this workflow."""
+        columns = ['row_number', 'course_id', 'date', 'subject_uri', 'subject_title', 'subject_language']
        with self.vertica.cursor() as cursor:
            expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_subjects_for_acceptance.csv')
-            expected = pandas.read_csv(expected_output_csv, parse_dates=True)
+            def convert_date(date_string):
+                """Convert date string to a date object."""
+                return datetime.datetime.strptime(date_string, '%Y-%m-%d').date()
+            expected = pandas.read_csv(expected_output_csv, converters={'date': convert_date})
            cursor.execute("SELECT * FROM {schema}.d_course_subjects;".format(schema=self.vertica.schema_name))
            database_subjects = cursor.fetchall()
-            subjects = pandas.DataFrame(database_subjects, columns=['row_number', 'course_id', 'date', 'subject_uri',
+            subjects = pandas.DataFrame(database_subjects, columns=columns)
-                                                                    'subject_title', 'subject_language'])
+            for frame in (subjects, expected):
+                frame.sort(['row_number'], inplace=True, ascending=[True])
+                frame.reset_index(drop=True, inplace=True)
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
+            self.assert_data_frames_equal(subjects, expected)
-                self.assertTrue(all(subjects == expected))
-            except ValueError:
-                self.fail("Expected and returned data frames have different shapes or labels.")
--- a/edx/analytics/tasks/tests/acceptance/test_financial_reports.py
+++ b/edx/analytics/tasks/tests/acceptance/test_financial_reports.py
@@ -10,7 +10,10 @@ import luigi
 import pandas
 from pandas.util.testing import assert_frame_equal, assert_series_equal
-from edx.analytics.tasks.tests.acceptance import AcceptanceTestCase, when_vertica_available, when_vertica_not_available
+from edx.analytics.tasks.tests.acceptance import (
+    AcceptanceTestCase, when_vertica_available, when_vertica_not_available, coerce_columns_to_string,
+    read_csv_fixture_as_list
+)
 from edx.analytics.tasks.util.url import url_path_join
 from edx.analytics.tasks.warehouse.financial.reconcile import LoadInternalReportingOrderTransactionsToWarehouse
@@ -58,27 +61,24 @@ class FinancialReportsAcceptanceTest(AcceptanceTestCase):
        with self.vertica.cursor() as cursor:
            expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
-            expected = pandas.read_csv(expected_output_csv, parse_dates=True)
+            expected_output_data = read_csv_fixture_as_list(expected_output_csv)
+            expected = pandas.DataFrame(expected_output_data, columns=columns)
            cursor.execute("SELECT {columns} FROM {schema}.f_orderitem_transactions".format(
                columns=','.join(columns),
                schema=self.vertica.schema_name
            ))
            response = cursor.fetchall()
-            f_orderitem_transactions = pandas.DataFrame(response, columns=columns)
+            f_orderitem_transactions = pandas.DataFrame(map(coerce_columns_to_string, response), columns=columns)
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
-                self.assertTrue(all(f_orderitem_transactions == expected))
+            for frame in (f_orderitem_transactions, expected):
-            except ValueError:
+                frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
-                buf = StringIO()
+                frame.reset_index(drop=True, inplace=True)
-                f_orderitem_transactions.to_csv(buf)
-                print 'Actual:'
+            self.assert_data_frames_equal(f_orderitem_transactions, expected)
-                print buf.getvalue()
-                buf.seek(0)
-                expected.to_csv(buf)
-                print 'Expected:'
-                print buf.getvalue()
-                self.fail("Expected and returned data frames have different shapes or labels.")
    @when_vertica_not_available
    def test_end_to_end_without_vertica(self):

--- a/edx/analytics/tasks/tests/acceptance/test_internal_reporting_certificate.py
+++ b/edx/analytics/tasks/tests/acceptance/test_internal_reporting_certificate.py
@@ -7,7 +7,9 @@ import os
 import pandas
-from edx.analytics.tasks.tests.acceptance import AcceptanceTestCase, when_vertica_available
+from edx.analytics.tasks.tests.acceptance import (
+    AcceptanceTestCase, when_vertica_available, coerce_columns_to_string, read_csv_fixture_as_list
+)
 log = logging.getLogger(__name__)
@@ -33,18 +35,22 @@ class InternalReportingCertificateLoadAcceptanceTest(AcceptanceTestCase):
    def validate_output(self):
        """Validates the output, comparing it to a csv of all the expected output from this workflow."""
+        columns = ['user_id', 'course_id', 'is_certified', 'certificate_mode', 'final_grade', 'has_passed',
+                   'created_date', 'modified_date']
        with self.vertica.cursor() as cursor:
            expected_output_csv = os.path.join(self.data_dir, 'output', 'acceptance_expected_d_user_course_certificate.csv')
-            expected = pandas.read_csv(expected_output_csv, parse_dates=True)
+            expected_output_data = read_csv_fixture_as_list(expected_output_csv)
+            expected = pandas.DataFrame(expected_output_data, columns=columns)
            cursor.execute("SELECT * FROM {schema}.d_user_course_certificate".format(schema=self.vertica.schema_name))
            response = cursor.fetchall()
-            d_user_course_certificate = pandas.DataFrame(response, columns=[
+            d_user_course_certificate = pandas.DataFrame(map(coerce_columns_to_string, response), columns=columns)
-                'user_id', 'course_id', 'is_certified', 'certificate_mode',
-                'final_grade', 'has_passed', 'created_date', 'modified_date',
+            for frame in (d_user_course_certificate, expected):
-            ])
+                frame.sort(['user_id'], inplace=True, ascending=[True])
+                frame.reset_index(drop=True, inplace=True)
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
-                self.assertTrue(all(d_user_course_certificate == expected))
+            self.assert_data_frames_equal(d_user_course_certificate, expected)
-            except ValueError:
-                self.fail("Expected and returned data frames have different shapes or labels.")
--- a/edx/analytics/tasks/tests/acceptance/test_internal_reporting_country.py
+++ b/edx/analytics/tasks/tests/acceptance/test_internal_reporting_country.py
@@ -45,7 +45,8 @@ class InternalReportingCountryLoadAcceptanceTest(AcceptanceTestCase):
            response = cursor.fetchall()
            d_country = pandas.DataFrame(response, columns=['country_name', 'user_last_location_country_code'])
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
+            for frame in (d_country, expected):
-                self.assertTrue(all(d_country == expected))
+                frame.sort(['country_name'], inplace=True, ascending=[True])
-            except ValueError:
+                frame.reset_index(drop=True, inplace=True)
-                self.fail("Expected and returned data frames have different shapes or labels.")
+            self.assert_data_frames_equal(d_country, expected)
--- a/edx/analytics/tasks/tests/acceptance/test_internal_reporting_user.py
+++ b/edx/analytics/tasks/tests/acceptance/test_internal_reporting_user.py
@@ -8,7 +8,9 @@ import os
 import pandas
-from edx.analytics.tasks.tests.acceptance import AcceptanceTestCase, when_vertica_available
+from edx.analytics.tasks.tests.acceptance import (
+    AcceptanceTestCase, when_vertica_available, coerce_columns_to_string, read_csv_fixture_as_list
+)
 log = logging.getLogger(__name__)
@@ -18,8 +20,8 @@ class InternalReportingUserLoadAcceptanceTest(AcceptanceTestCase):
    """End-to-end test of the workflow to load the internal reporting warehouse's user table."""
    INPUT_FILE = 'location_by_course_tracking.log'
-    INTERVAL = '2014-07-21-2014-07-21'
+    INTERVAL = '2014-07-21-2014-07-22'
-    DATE = '2014-07-21'
+    DATE = '2014-07-22'
    def setUp(self):
        super(InternalReportingUserLoadAcceptanceTest, self).setUp()
@@ -51,18 +53,22 @@ class InternalReportingUserLoadAcceptanceTest(AcceptanceTestCase):
    def validate_output(self):
        """Validates the output, comparing it to a csv of all the expected output from this workflow."""
+        columns = ['user_id', 'user_year_of_birth', 'user_level_of_education', 'user_gender', 'user_email',
+                   'user_username', 'user_account_creation_time', 'user_last_location_country_code']
        with self.vertica.cursor() as cursor:
            expected_output_csv = os.path.join(self.data_dir, 'output', 'acceptance_expected_d_user.csv')
-            expected = pandas.read_csv(expected_output_csv, parse_dates=True)
+            expected_output_data = read_csv_fixture_as_list(expected_output_csv)
+            expected = pandas.DataFrame(expected_output_data, columns=columns)
            cursor.execute("SELECT * FROM {schema}.d_user".format(schema=self.vertica.schema_name))
            response = cursor.fetchall()
-            d_user = pandas.DataFrame(response, columns=['user_id', 'user_year_of_birth', 'user_level_of_education',
+            d_user = pandas.DataFrame(map(coerce_columns_to_string, response), columns=columns)
-                                                         'user_gender', 'user_email', 'user_username',
-                                                         'user_account_creation_time',
+            for frame in (d_user, expected):
-                                                         'user_last_location_country_code'])
+                frame.sort(['user_id'], inplace=True, ascending=[True])
+                frame.reset_index(drop=True, inplace=True)
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
-                self.assertTrue(all(d_user == expected))
+            self.assert_data_frames_equal(d_user, expected)
-            except ValueError:
-                self.fail("Expected and returned data frames have different shapes or labels.")
--- a/edx/analytics/tasks/tests/acceptance/test_lms_courseware_link_clicked.py
+++ b/edx/analytics/tasks/tests/acceptance/test_lms_courseware_link_clicked.py
@@ -3,7 +3,7 @@ End-to-end test of the workflow to load the warehouse's lms_courseware_link_clic
 """
-from datetime import date
+import datetime
 import os
 import logging
@@ -23,7 +23,7 @@ class LmsCoursewareLinkClickedAcceptanceTest(AcceptanceTestCase):
    """
    INPUT_FILE = 'lms_courseware_link_clicked_acceptance_tracking.log'
-    DATE = date(2016, 6, 13)
+    DATE = datetime.date(2016, 6, 13)
    @when_vertica_available
    def test_lms_courseware_link_clicked(self):
@@ -46,7 +46,12 @@ class LmsCoursewareLinkClickedAcceptanceTest(AcceptanceTestCase):
                'output',
                'acceptance_expected_lms_courseware_link_clicked_events.csv'
            )
-            expected = pandas.read_csv(expected_output_csv, parse_dates=True)
+            def convert_date(date_string):
+                """Convert date string to a date object."""
+                return datetime.datetime.strptime(date_string, '%Y-%m-%d').date()
+            expected = pandas.read_csv(expected_output_csv, converters={'event_date': convert_date})
            cursor.execute(
                "SELECT * FROM {schema}.lms_courseware_link_clicked_events ORDER BY course_id, event_date"
@@ -65,7 +70,8 @@ class LmsCoursewareLinkClickedAcceptanceTest(AcceptanceTestCase):
                ]
            )
-            try:  # A ValueError will be thrown if the column names don't match or the two data frames are not square.
+            for frame in (lms_courseware_link_clicked_events, expected):
-                self.assertTrue(all(lms_courseware_link_clicked_events == expected))
+                frame.sort(['record_number'], inplace=True, ascending=[True])
-            except ValueError:
+                frame.reset_index(drop=True, inplace=True)
-                self.fail("Expected and returned data frames have different shapes or labels.")
+            self.assert_data_frames_equal(lms_courseware_link_clicked_events, expected)