Encoding in requests from Java

The setup is Tomcat 8 with servlets, Eclipse as the IDE, and Debian 9 as the OS. From a servlet I send a GET request to a remote server (specifying an encoding when reading the response; it is assumed to be the encoding of the page on the remote server), receive the HTML page, and parse it. The pages contain Russian and Ukrainian letters. The problem is that, even though the response seems to be read correctly in Eclipse, the Russian letters are displayed either as ???? or as completely different characters, and what ends up written to the file is terrible mojibake ("krakozyabry").

The problem is observed with two sites:

"http://www.meteo.nw.ru/weather/lo_meteod.php", "windows-1251" (source encoding on the site) , and

"http://meteoinfo.ru/mosobl", "UTF-8"(the source encoding on this site)

Here is a simplified code, trying to recode a small piece of Russian text:

/**
 * Handles a GET request by downloading a remote HTML page, cutting a 20-character
 * fragment out of it, echoing that fragment to the servlet response and to stdout,
 * appending it to a local text file, and finally reading the whole file back and
 * echoing every line.
 *
 * NOTE(review): the remote page is decoded with the explicit {@code encoding} below, but
 * the file is written through {@code FileWriter} and read back through an
 * {@code InputStreamReader} created without a charset, so both use the platform default
 * charset rather than the page's encoding.
 * NOTE(review): no character encoding is set on {@code response} before
 * {@code getWriter()} is called; the container default is presumably ISO-8859-1, which
 * cannot represent Cyrillic — confirm against the Servlet specification.
 *
 * @param request  the incoming HTTP request; only its context path is read
 * @param response the HTTP response that progress messages are written to
 * @throws ServletException declared by the servlet contract; not thrown explicitly here
 * @throws IOException if writing to the response fails or opening the connection fails
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    // TODO Auto-generated method stub
    response.getWriter().append("Served at: ").append(request.getContextPath());
    response.getWriter().println();

    // Accumulates the downloaded page; stays null if the download below does not happen.
    StringBuffer response1 = null;
    String url = "http://www.meteo.nw.ru/weather/lo_meteod.php";
    String encoding ="windows-1251";
    URL obj = new URL(url );
    HttpURLConnection connection = (HttpURLConnection) obj.openConnection();
    int status =0;
    int connectionCounter = 0;// no more than three times
    // NOTE(review): connectionCounter is never incremented, so the "< 3" guard below is
    // always true and the loop keeps opening new connections until HTTP 200 is returned
    // or an exception is thrown.
    try {

        response.getWriter().println("Make the connection with Meteod");
        do {
            if(connectionCounter<3){
                connection = (HttpURLConnection) obj.openConnection();
                connection.setRequestMethod("GET"); 
                if (status!= HttpURLConnection.HTTP_OK) status = connection.getResponseCode();// ask up to three times, but only until the first positive response
                else break;
            }
        } while (status!= HttpURLConnection.HTTP_OK);// until a positive response arrives from the server
    }
    catch(ConnectException e) {
        System.out.println("Ошибка соединения ");
        System.out.println(e.toString());
    }
    catch(IllegalArgumentException e) {
        System.out.println("Ошибка IllegalArgumentException ");
        System.out.println(e.toString());
    }
    catch(IOException e) {
        System.out.println("Общая ошибка ");
        System.out.println(e.toString());
    }
    try{    
        if(status== HttpURLConnection.HTTP_OK){// if the response arrived
            // Decode the response body with the explicitly chosen page encoding.
            BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream(), encoding));
            String inputLine;
            response1 = new StringBuffer();
            // Lines are concatenated without separators, so the page becomes one long line.
            while ((inputLine = in.readLine()) != null)
                response1.append(inputLine);
            in.close();
        }
        else {System.out.println("Сервер не отвечает по адресу "+url); }
    }
    catch(NullPointerException e) {
        System.out.println("NullPointerException ");
        System.out.println(e.toString());
    }
    catch(IOException e) {
        System.out.println("Общая ошибка ");
        System.out.println(e.toString());
    }

    // NOTE(review): response1 is still null here when the status was not HTTP_OK or the
    // read failed before the buffer was created, in which case this line throws a
    // NullPointerException outside of any try block.
    String answer = response1.toString();
    // NOTE(review): if the marker text is absent indexOf returns -1, and if fewer than 20
    // characters follow it the end index is out of range; substring throws in both cases.
    int ind = answer.indexOf("се");
    answer = answer.substring(ind, ind+20);
    response.getWriter().println("The piece of answer:    "+answer);
    System.out.println("Пример распарсенного ответа:  "+answer);

    // Encode the fragment to Cp1251 bytes and decode it straight back.
    byte[] winData = answer.getBytes("Cp1251");

    // NOTE(review): this re-decoded value is never used; the two prints below output
    // the original 'answer' again.
    String string = new String(winData,"Cp1251");

    response.getWriter().println("After recoding :    "+answer);
    System.out.println("После перекодиовки:  "+answer);


    // Hard-coded absolute location of the output file.
    File fileMain = new File("C:" + File.separator + "Users" + File.separator + "Alena" + File.separator + "workspace" + File.separator + "AServlet" + File.separator + "test.txt");
    try {

        if(!fileMain.exists()){// check: if the file does not exist, create it
            fileMain.createNewFile();
            response.getWriter().println("create an file "+fileMain.getAbsolutePath());
            System.out.println("Создали файл " + fileMain );
        }

        //BufferedWriter outForMain = new BufferedWriter(new OutputStreamWriter((), "<encoding name>"));
        // FileWriter is given no charset, so the text is encoded with the platform default.
        BufferedWriter outForMain = new BufferedWriter(new FileWriter(fileMain, true));// append to the end of the document
        try
        {
            response.getWriter().println("Write in file "+fileMain.getAbsolutePath());
            System.out.println("Пишепм");
                outForMain.append(answer);
                outForMain.newLine();
            } finally {
                outForMain.close();
            }

        // Read the whole file back; no charset is given, so the platform default is used.
        FileInputStream fstream = new FileInputStream(fileMain);
           BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
        try{
            response.getWriter().println("Read from file "+fileMain.getAbsolutePath());
               String strLine;
               while ((strLine = br.readLine()) != null){
                   response.getWriter().println("The piece of something we read:    "+strLine);
                  System.out.println("Прочли "+strLine);
               }
        } finally {
            br.close();
        }
            }catch (IOException e){
               // Only a fixed message is printed; the exception details are discarded.
               System.out.println("Ошибка");
            }
}
Author: MSDN.WhiteKnight, 2017-12-24

1 answers

Your code is beyond good and evil. An application written in this style will be cursed by the first person who tries to fix or change something in it, and that first person will most likely be you. Write simple, short methods; you can use standard design patterns. Try to get rid of duplicated code by moving it into separate methods. This will make the code concise and clear. In addition, at least occasionally look at the documentation and keep track of which goodies appear with each new Java release and what the language developers themselves think about them. For example, the try-with-resources construct appeared in version 7 and, according to the documentation, its use is strongly recommended. And then there are lambdas... They can also shorten the code. All in all, as far as I understand the problem, you have difficulties with encoding when parsing the two sites you mentioned. I wrote a simple and clear class for you. You can turn the main method into a method that accepts some parameters, depending on your interfaces, and then, by simply calling three methods, you can parse the site, write it to a text file, and read it back from the text file. Note that my code does not even need comments to be understood. One more thing: in this case we simply get a String variable that contains the HTML of the page; we write it to a file, read it back, and print it to the console. After all, the question says that the problem is with the encoding, and my code solves that problem. If you need to parse the content, I strongly recommend using generally accepted libraries: Jsoup if it is just an HTML page, and Jackson if you need JSON. Note that with Jsoup you can obtain a standard document for further work in various ways, including from a regular String variable containing an HTML page: Document document = Jsoup.parse(htmlPage). For a site, it is also possible to pass a URL or an InputStream, which is a lot more constructive.
If you need to read HTML from a file, then one of the options already mentioned is to pass an InputStream, but in some cases you may need to pass a plain String variable, as I showed. Good luck!

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads a web page using an explicit charset, stores it in a text file using an
 * explicit charset, and reads it back the same way, so that no step falls back to the
 * platform default encoding.
 */
class Parser {

    private static final String CHARSET_CP1251 = "Cp1251";
    private static final String CHARSET_UTF8 = "UTF-8";
    private static final String FILE_NAME = "1.txt";

    /** Terminator appended after every line that is read, whatever the source used. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Downloads the sample page, writes it to {@code 1.txt} as UTF-8, reads the file
     * back as UTF-8 and prints the result to standard output.
     *
     * @param args ignored
     * @throws IOException if the download or any file operation fails
     */
    public static void main(String[] args) throws IOException {

        Parser parser = new Parser();

        String site = parser.parseSite("http://www.meteo.nw.ru/weather/lo_meteod.php", CHARSET_CP1251);
        //String site = parser.parseSite("http://meteoinfo.ru/mosobl", CHARSET_UTF8);

        parser.writeFile(FILE_NAME, site, CHARSET_UTF8);

        String readFile = parser.readFile(FILE_NAME, CHARSET_UTF8);

        System.out.println(readFile);

    }

    /**
     * Fetches the body of {@code url} and decodes it with {@code charset}.
     *
     * @param url     address of the page to download
     * @param charset name of the charset the remote page is encoded in
     * @return the page text, every line terminated with {@code "\r\n"}
     * @throws IOException if the connection fails, the body cannot be read, or the
     *                     charset name is not supported
     */
    private String parseSite(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        // The raw stream is its own resource so that it is closed even when the charset
        // name is rejected before a reader could take ownership of it.
        try (InputStream body = connection.getInputStream()) {
            return toStringByInputStream(new InputStreamReader(body, charset));
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Reads the whole file {@code fileName}, decoding it with {@code charset}.
     *
     * @param fileName path of the file to read
     * @param charset  name of the charset the file is encoded in
     * @return the file text, every line terminated with {@code "\r\n"}
     * @throws IOException if the file cannot be opened or read, or the charset name is
     *                     not supported
     */
    private String readFile(String fileName, String charset) throws IOException {
        // Same reasoning as in parseSite: keep the raw stream as a separate resource.
        try (FileInputStream in = new FileInputStream(fileName)) {
            return toStringByInputStream(new InputStreamReader(in, charset));
        }
    }

    /**
     * Drains {@code inputStreamReader} line by line and closes it.
     *
     * @param inputStreamReader the already-decoding reader to consume; closed on return
     * @return all lines, each followed by {@code "\r\n"} (including the last one)
     * @throws IOException if reading fails
     */
    private String toStringByInputStream(InputStreamReader inputStreamReader) throws IOException {
        StringBuilder result = new StringBuilder();
        try (BufferedReader bufferedReader = new BufferedReader(inputStreamReader)) {
            // A plain readLine loop reports read failures as IOException, matching the
            // declared contract, instead of the UncheckedIOException thrown by lines().
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                result.append(line).append(LINE_SEPARATOR);
            }
        }
        return result.toString();
    }

    /**
     * Writes {@code text} to {@code fileName}, replacing any existing content and
     * encoding it with {@code charset}.
     *
     * @param fileName path of the file to (over)write
     * @param text     the text to store
     * @param charset  name of the charset to encode the text with
     * @throws IOException if the file cannot be opened or written, or the charset name
     *                     is not supported
     */
    private void writeFile(String fileName, String text, String charset) throws IOException {
        // Closing the writer flushes it, so no explicit flush() is needed.
        try (FileOutputStream out = new FileOutputStream(fileName);
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, charset))) {
            writer.write(text);
        }
    }

}
 2
Author: Дмитрий, 2017-12-25 11:07:42