import dataiku

import requests
import gzip
import json
import csv
import random

# URL & filenames to download & create
URL = 'http://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Luxury_Beauty_5.json.gz'
FILE_NAME = 'Luxury_Beauty_5.json.gz'
FILE_UNZIP = 'Luxury_Beauty_5.json'
PROD_CATEGORY = "Luxury Beauty"
SAMPLE_SIZE = 256 # GPU provisioned :fingers-crossed:
SAMPLE_SIZE = 32 # in case no GPU :warning:
DATASET_NAME = "beauty_product_reviews"

response = requests.get(URL)

with open(FILE_NAME, 'wb') as f:
    f.write(response.content)

# Unzip the archive
with gzip.open(FILE_NAME, 'rb') as gz_file:
     with open(FILE_UNZIP, "wb") as f_out:
        f_out.write(gz_file.read())

with open(FILE_UNZIP, "r", encoding="utf-8") as f:
    data = []
    for line in f:
        record = json.loads(line)
        text = record.get("reviewText", "")
        category = PROD_CATEGORY
        sentiment = record.get("overall", "")
        if sentiment in [1, 2]:
            sentiment = "negative"
        elif sentiment == 3:
            sentiment = "neutral"
        elif sentiment in [4, 5]:
            sentiment = "positive"
        data.append({"text": text, "product_category": category, "sentiment": sentiment})
        

# Get a random sample of 1000 records
sample_data = random.sample(data, SAMPLE_SIZE)

# Get the dataset object
dataset = dataiku.Dataset(DATASET_NAME)

# Define the schema for the dataset
schema = [{"name": "text", "type": "string"},
          {"name": "product_category", "type": "string"},
          {"name": "sentiment", "type": "string"}]

# Write the schema to the dataset
dataset.write_schema(schema)

# Write the rows to the dataset
with dataset.get_writer() as writer:
    for row in sample_data:
        writer.write_row_dict(row)